1 //Written in the D programming language
2 /*
3 Implementation of Thompson NFA std.regex engine.
4 Key point is evaluation of all possible threads (state) at each step
5 in a breadth-first manner, thereby geting some nice properties:
6 - looking at each character only once
7 - merging of equivalent threads, that gives matching process linear time complexity
8 */
9 module std.regex.internal.thompson;
10
11 package(std.regex):
12
13 import std.range.primitives;
14 import std.regex.internal.ir;
15
16 //State of VM thread
Thread(DataIndex)17 struct Thread(DataIndex)
18 {
19 Thread* next; //intrusive linked list
20 uint pc;
21 uint counter; //loop counter
22 uint uopCounter; //counts micro operations inside one macro instruction (e.g. BackRef)
23 Group!DataIndex[1] matches;
24 }
25
26 //head-tail singly-linked list
ThreadList(DataIndex)27 struct ThreadList(DataIndex)
28 {
29 Thread!DataIndex* tip = null, toe = null;
30 //add new thread to the start of list
31 void insertFront(Thread!DataIndex* t)
32 {
33 if (tip)
34 {
35 t.next = tip;
36 tip = t;
37 }
38 else
39 {
40 t.next = null;
41 tip = toe = t;
42 }
43 }
44 //add new thread to the end of list
45 void insertBack(Thread!DataIndex* t)
46 {
47 if (toe)
48 {
49 toe.next = t;
50 toe = t;
51 }
52 else
53 tip = toe = t;
54 toe.next = null;
55 }
56 //move head element out of list
57 Thread!DataIndex* fetch()
58 {
59 auto t = tip;
60 if (tip == toe)
61 tip = toe = null;
62 else
63 tip = tip.next;
64 return t;
65 }
66 //non-destructive iteration of ThreadList
67 struct ThreadRange
68 {
69 const(Thread!DataIndex)* ct;
70 this(ThreadList tlist){ ct = tlist.tip; }
71 @property bool empty(){ return ct is null; }
72 @property const(Thread!DataIndex)* front(){ return ct; }
73 void popFront()
74 {
75 assert(ct);
76 ct = ct.next;
77 }
78 }
79 @property bool empty()
80 {
81 return tip == null;
82 }
83 ThreadRange opSlice()
84 {
85 return ThreadRange(this);
86 }
87 }
88
89 template ThompsonOps(E, S, bool withInput:true)
90 {
91 @trusted:
92 static bool op(IR code:IR.End)(E e, S* state)
93 {
with(state)94 with(e) with(state)
95 {
96 finish(t, matches, re.ir[t.pc].data);
97 //fix endpoint of the whole match
98 matches[0].end = index;
99 recycle(t);
100 //cut off low priority threads
101 recycle(clist);
102 recycle(worklist);
103 debug(std_regex_matcher) writeln("Finished thread ", matches);
104 return false; // no more state to eval
105 }
106 }
107
108 static bool op(IR code:IR.Wordboundary)(E e, S* state)
109 {
with(state)110 with(e) with(state)
111 {
112 dchar back;
113 DataIndex bi;
114 //at start & end of input
115 if (atStart && wordMatcher[front])
116 {
117 t.pc += IRL!(IR.Wordboundary);
118 return true;
119 }
120 else if (atEnd && s.loopBack(index).nextChar(back, bi)
121 && wordMatcher[back])
122 {
123 t.pc += IRL!(IR.Wordboundary);
124 return true;
125 }
126 else if (s.loopBack(index).nextChar(back, bi))
127 {
128 bool af = wordMatcher[front];
129 bool ab = wordMatcher[back];
130 if (af ^ ab)
131 {
132 t.pc += IRL!(IR.Wordboundary);
133 return true;
134 }
135 }
136 return popState(e);
137 }
138 }
139
140 static bool op(IR code:IR.Notwordboundary)(E e, S* state)
141 {
with(state)142 with(e) with(state)
143 {
144 dchar back;
145 DataIndex bi;
146 //at start & end of input
147 if (atStart && wordMatcher[front])
148 {
149 return popState(e);
150 }
151 else if (atEnd && s.loopBack(index).nextChar(back, bi)
152 && wordMatcher[back])
153 {
154 return popState(e);
155 }
156 else if (s.loopBack(index).nextChar(back, bi))
157 {
158 bool af = wordMatcher[front];
159 bool ab = wordMatcher[back] != 0;
160 if (af ^ ab)
161 {
162 return popState(e);
163 }
164 }
165 t.pc += IRL!(IR.Notwordboundary);
166 }
167 return true;
168 }
169
170 static bool op(IR code:IR.Bof)(E e, S* state)
171 {
with(state)172 with(e) with(state)
173 {
174 if (atStart)
175 {
176 t.pc += IRL!(IR.Bof);
177 return true;
178 }
179 else
180 {
181 return popState(e);
182 }
183 }
184 }
185
186 static bool op(IR code:IR.Bol)(E e, S* state)
187 {
with(state)188 with(e) with(state)
189 {
190 dchar back;
191 DataIndex bi;
192 if (atStart
193 ||(s.loopBack(index).nextChar(back,bi)
194 && startOfLine(back, front == '\n')))
195 {
196 t.pc += IRL!(IR.Bol);
197 return true;
198 }
199 else
200 {
201 return popState(e);
202 }
203 }
204 }
205
206 static bool op(IR code:IR.Eof)(E e, S* state)
207 {
with(state)208 with(e) with(state)
209 {
210 if (atEnd)
211 {
212 t.pc += IRL!(IR.Eol);
213 return true;
214 }
215 else
216 {
217 return popState(e);
218 }
219 }
220 }
221
222 static bool op(IR code:IR.Eol)(E e, S* state)
223 {
with(state)224 with(e) with(state)
225 {
226 dchar back;
227 DataIndex bi;
228 //no matching inside \r\n
229 if (atEnd || (endOfLine(front, s.loopBack(index).nextChar(back, bi)
230 && back == '\r')))
231 {
232 t.pc += IRL!(IR.Eol);
233 return true;
234 }
235 else
236 {
237 return popState(e);
238 }
239
240 }
241 }
242
243 static bool op(IR code:IR.InfiniteStart)(E e, S* state)
244 {
245 with(e) with(state)
246 t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteStart);
247 return op!(IR.InfiniteEnd)(e,state);
248 }
249
250 static bool op(IR code:IR.InfiniteBloomStart)(E e, S* state)
251 {
252 with(e) with(state)
253 t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteBloomStart);
254 return op!(IR.InfiniteBloomEnd)(e,state);
255 }
256
257 static bool op(IR code:IR.InfiniteQStart)(E e, S* state)
258 {
259 with(e) with(state)
260 t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteQStart);
261 return op!(IR.InfiniteQEnd)(e,state);
262 }
263
264 static bool op(IR code:IR.RepeatStart)(E e, S* state)
265 {
266 with(e) with(state)
267 t.pc += re.ir[t.pc].data + IRL!(IR.RepeatStart);
268 return op!(IR.RepeatEnd)(e,state);
269 }
270
271 static bool op(IR code:IR.RepeatQStart)(E e, S* state)
272 {
273 with(e) with(state)
274 t.pc += re.ir[t.pc].data + IRL!(IR.RepeatQStart);
275 return op!(IR.RepeatQEnd)(e,state);
276 }
277
278 static bool op(IR code)(E e, S* state)
279 if (code == IR.RepeatEnd || code == IR.RepeatQEnd)
280 {
with(state)281 with(e) with(state)
282 {
283 //len, step, min, max
284 uint len = re.ir[t.pc].data;
285 uint step = re.ir[t.pc+2].raw;
286 uint min = re.ir[t.pc+3].raw;
287 if (t.counter < min)
288 {
289 t.counter += step;
290 t.pc -= len;
291 return true;
292 }
293 if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter)
294 {
295 debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s",
296 t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] );
297 merge[re.ir[t.pc + 1].raw+t.counter] = genCounter;
298 }
299 else
300 {
301 debug(std_regex_matcher)
302 writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s",
303 t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] );
304 return popState(e);
305 }
306 uint max = re.ir[t.pc+4].raw;
307 if (t.counter < max)
308 {
309 if (re.ir[t.pc].code == IR.RepeatEnd)
310 {
311 //queue out-of-loop thread
312 worklist.insertFront(fork(t, t.pc + IRL!(IR.RepeatEnd), t.counter % step));
313 t.counter += step;
314 t.pc -= len;
315 }
316 else
317 {
318 //queue into-loop thread
319 worklist.insertFront(fork(t, t.pc - len, t.counter + step));
320 t.counter %= step;
321 t.pc += IRL!(IR.RepeatEnd);
322 }
323 }
324 else
325 {
326 t.counter %= step;
327 t.pc += IRL!(IR.RepeatEnd);
328 }
329 return true;
330 }
331 }
332
333 static bool op(IR code)(E e, S* state)
334 if (code == IR.InfiniteEnd || code == IR.InfiniteQEnd)
335 {
with(state)336 with(e) with(state)
337 {
338 if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter)
339 {
340 debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s",
341 t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] );
342 merge[re.ir[t.pc + 1].raw+t.counter] = genCounter;
343 }
344 else
345 {
346 debug(std_regex_matcher) writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s",
347 t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] );
348 return popState(e);
349 }
350 uint len = re.ir[t.pc].data;
351 uint pc1, pc2; //branches to take in priority order
352 if (re.ir[t.pc].code == IR.InfiniteEnd)
353 {
354 pc1 = t.pc - len;
355 pc2 = t.pc + IRL!(IR.InfiniteEnd);
356 }
357 else
358 {
359 pc1 = t.pc + IRL!(IR.InfiniteEnd);
360 pc2 = t.pc - len;
361 }
362 worklist.insertFront(fork(t, pc2, t.counter));
363 t.pc = pc1;
364 return true;
365 }
366 }
367
368 static bool op(IR code)(E e, S* state)
369 if (code == IR.InfiniteBloomEnd)
370 {
with(state)371 with(e) with(state)
372 {
373 if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter)
374 {
375 debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s",
376 t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] );
377 merge[re.ir[t.pc + 1].raw+t.counter] = genCounter;
378 }
379 else
380 {
381 debug(std_regex_matcher) writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s",
382 t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] );
383 return popState(e);
384 }
385 uint len = re.ir[t.pc].data;
386 uint pc1, pc2; //branches to take in priority order
387 pc1 = t.pc - len;
388 pc2 = t.pc + IRL!(IR.InfiniteBloomEnd);
389 uint filterIndex = re.ir[t.pc + 2].raw;
390 if (re.filters[filterIndex][front])
391 worklist.insertFront(fork(t, pc2, t.counter));
392 t.pc = pc1;
393 return true;
394 }
395 }
396
397 static bool op(IR code:IR.OrEnd)(E e, S* state)
398 {
with(state)399 with(e) with(state)
400 {
401 if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter)
402 {
403 debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s",
404 t.pc, s[index .. s.lastIndex], genCounter, merge[re.ir[t.pc + 1].raw + t.counter] );
405 merge[re.ir[t.pc + 1].raw+t.counter] = genCounter;
406 t.pc += IRL!(IR.OrEnd);
407 }
408 else
409 {
410 debug(std_regex_matcher) writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s",
411 t.pc, s[index .. s.lastIndex], genCounter, merge[re.ir[t.pc + 1].raw + t.counter] );
412 return popState(e);
413 }
414 return true;
415 }
416 }
417
418 static bool op(IR code:IR.OrStart)(E e, S* state)
419 {
with(state)420 with(e) with(state)
421 {
422 t.pc += IRL!(IR.OrStart);
423 return op!(IR.Option)(e,state);
424 }
425 }
426
427 static bool op(IR code:IR.Option)(E e, S* state)
428 {
with(state)429 with(e) with(state)
430 {
431 uint next = t.pc + re.ir[t.pc].data + IRL!(IR.Option);
432 //queue next Option
433 if (re.ir[next].code == IR.Option)
434 {
435 worklist.insertFront(fork(t, next, t.counter));
436 }
437 t.pc += IRL!(IR.Option);
438 return true;
439 }
440 }
441
442 static bool op(IR code:IR.GotoEndOr)(E e, S* state)
443 {
with(state)444 with(e) with(state)
445 {
446 t.pc = t.pc + re.ir[t.pc].data + IRL!(IR.GotoEndOr);
447 return op!(IR.OrEnd)(e, state);
448 }
449 }
450
451 static bool op(IR code:IR.GroupStart)(E e, S* state)
452 {
with(state)453 with(e) with(state)
454 {
455 uint n = re.ir[t.pc].data;
456 t.matches.ptr[n].begin = index;
457 t.pc += IRL!(IR.GroupStart);
458 return true;
459 }
460 }
461 static bool op(IR code:IR.GroupEnd)(E e, S* state)
462 {
with(state)463 with(e) with(state)
464 {
465 uint n = re.ir[t.pc].data;
466 t.matches.ptr[n].end = index;
467 t.pc += IRL!(IR.GroupEnd);
468 return true;
469 }
470 }
471
472 static bool op(IR code:IR.Backref)(E e, S* state)
473 {
with(state)474 with(e) with(state)
475 {
476 uint n = re.ir[t.pc].data;
477 Group!DataIndex* source = re.ir[t.pc].localRef ? t.matches.ptr : backrefed.ptr;
478 assert(source);
479 if (source[n].begin == source[n].end)//zero-width Backref!
480 {
481 t.pc += IRL!(IR.Backref);
482 return true;
483 }
484 else
485 {
486 size_t idx = source[n].begin + t.uopCounter;
487 size_t end = source[n].end;
488 if (s[idx .. end].front == front)
489 {
490 import std.utf : stride;
491
492 t.uopCounter += stride(s[idx .. end], 0);
493 if (t.uopCounter + source[n].begin == source[n].end)
494 {//last codepoint
495 t.pc += IRL!(IR.Backref);
496 t.uopCounter = 0;
497 }
498 nlist.insertBack(t);
499 }
500 else
501 recycle(t);
502 t = worklist.fetch();
503 return t != null;
504 }
505 }
506 }
507
508
509 static bool op(IR code)(E e, S* state)
510 if (code == IR.LookbehindStart || code == IR.NeglookbehindStart)
511 {
with(state)512 with(e) with(state)
513 {
514 uint len = re.ir[t.pc].data;
515 uint ms = re.ir[t.pc + 1].raw, me = re.ir[t.pc + 2].raw;
516 uint end = t.pc + len + IRL!(IR.LookbehindEnd) + IRL!(IR.LookbehindStart);
517 bool positive = re.ir[t.pc].code == IR.LookbehindStart;
518 static if (Stream.isLoopback)
519 auto matcher = fwdMatcher(t.pc, end, me - ms, subCounters.get(t.pc, 0));
520 else
521 auto matcher = bwdMatcher(t.pc, end, me - ms, subCounters.get(t.pc, 0));
522 matcher.backrefed = backrefed.empty ? t.matches : backrefed;
523 //backMatch
524 auto mRes = matcher.matchOneShot(t.matches.ptr[ms .. me], IRL!(IR.LookbehindStart));
525 freelist = matcher.freelist;
526 subCounters[t.pc] = matcher.genCounter;
527 if ((mRes != 0 ) ^ positive)
528 {
529 return popState(e);
530 }
531 t.pc = end;
532 return true;
533 }
534 }
535
536 static bool op(IR code)(E e, S* state)
537 if (code == IR.LookaheadStart || code == IR.NeglookaheadStart)
538 {
with(state)539 with(e) with(state)
540 {
541 auto save = index;
542 uint len = re.ir[t.pc].data;
543 uint ms = re.ir[t.pc+1].raw, me = re.ir[t.pc+2].raw;
544 uint end = t.pc+len+IRL!(IR.LookaheadEnd)+IRL!(IR.LookaheadStart);
545 bool positive = re.ir[t.pc].code == IR.LookaheadStart;
546 static if (Stream.isLoopback)
547 auto matcher = bwdMatcher(t.pc, end, me - ms, subCounters.get(t.pc, 0));
548 else
549 auto matcher = fwdMatcher(t.pc, end, me - ms, subCounters.get(t.pc, 0));
550 matcher.backrefed = backrefed.empty ? t.matches : backrefed;
551 auto mRes = matcher.matchOneShot(t.matches.ptr[ms .. me], IRL!(IR.LookaheadStart));
552 freelist = matcher.freelist;
553 subCounters[t.pc] = matcher.genCounter;
554 s.reset(index);
555 next();
556 if ((mRes != 0) ^ positive)
557 {
558 return popState(e);
559 }
560 t.pc = end;
561 return true;
562 }
563 }
564
565 static bool op(IR code)(E e, S* state)
566 if (code == IR.LookaheadEnd || code == IR.NeglookaheadEnd ||
567 code == IR.LookbehindEnd || code == IR.NeglookbehindEnd)
568 {
with(state)569 with(e) with(state)
570 {
571 finish(t, matches.ptr[0 .. re.ngroup], re.ir[t.pc].data);
572 recycle(t);
573 //cut off low priority threads
574 recycle(clist);
575 recycle(worklist);
576 return false; // no more state
577 }
578 }
579
580 static bool op(IR code:IR.Nop)(E e, S* state)
581 {
582 with(state) t.pc += IRL!(IR.Nop);
583 return true;
584 }
585
586 static bool op(IR code:IR.OrChar)(E e, S* state)
587 {
with(state)588 with(e) with(state)
589 {
590 uint len = re.ir[t.pc].sequence;
591 uint end = t.pc + len;
592 static assert(IRL!(IR.OrChar) == 1);
593 for (; t.pc < end; t.pc++)
594 if (re.ir[t.pc].data == front)
595 break;
596 if (t.pc != end)
597 {
598 t.pc = end;
599 nlist.insertBack(t);
600 }
601 else
602 recycle(t);
603 t = worklist.fetch();
604 return t != null;
605 }
606 }
607
608 static bool op(IR code:IR.Char)(E e, S* state)
609 {
with(state)610 with(e) with(state)
611 {
612 if (front == re.ir[t.pc].data)
613 {
614 t.pc += IRL!(IR.Char);
615 nlist.insertBack(t);
616 }
617 else
618 recycle(t);
619 t = worklist.fetch();
620 return t != null;
621 }
622 }
623
624 static bool op(IR code:IR.Any)(E e, S* state)
625 {
with(state)626 with(e) with(state)
627 {
628 t.pc += IRL!(IR.Any);
629 nlist.insertBack(t);
630 t = worklist.fetch();
631 return t != null;
632 }
633 }
634
635 static bool op(IR code:IR.CodepointSet)(E e, S* state)
636 {
with(state)637 with(e) with(state)
638 {
639 if (re.charsets[re.ir[t.pc].data].scanFor(front))
640 {
641 t.pc += IRL!(IR.CodepointSet);
642 nlist.insertBack(t);
643 }
644 else
645 {
646 recycle(t);
647 }
648 t = worklist.fetch();
649 return t != null;
650 }
651 }
652
653 static bool op(IR code:IR.Trie)(E e, S* state)
654 {
with(state)655 with(e) with(state)
656 {
657 if (re.matchers[re.ir[t.pc].data][front])
658 {
659 t.pc += IRL!(IR.Trie);
660 nlist.insertBack(t);
661 }
662 else
663 {
664 recycle(t);
665 }
666 t = worklist.fetch();
667 return t != null;
668 }
669 }
670
671 }
672
673 template ThompsonOps(E,S, bool withInput:false)
674 {
675 @trusted:
676 // can't match these without input
677 static bool op(IR code)(E e, S* state)
678 if (code == IR.Char || code == IR.OrChar || code == IR.CodepointSet
679 || code == IR.Trie || code == IR.Char || code == IR.Any)
680 {
681 return state.popState(e);
682 }
683
684 // special case of zero-width backref
685 static bool op(IR code:IR.Backref)(E e, S* state)
686 {
with(state)687 with(e) with(state)
688 {
689 uint n = re.ir[t.pc].data;
690 Group!DataIndex* source = re.ir[t.pc].localRef ? t.matches.ptr : backrefed.ptr;
691 assert(source);
692 if (source[n].begin == source[n].end)//zero-width Backref!
693 {
694 t.pc += IRL!(IR.Backref);
695 return true;
696 }
697 else
698 return popState(e);
699 }
700 }
701
702 // forward all control flow to normal versions
703 static bool op(IR code)(E e, S* state)
704 if (code != IR.Char && code != IR.OrChar && code != IR.CodepointSet
705 && code != IR.Trie && code != IR.Char && code != IR.Any && code != IR.Backref)
706 {
707 return ThompsonOps!(E,S,true).op!code(e,state);
708 }
709 }
710
711 /+
712 Thomspon matcher does all matching in lockstep,
713 never looking at the same char twice
714 +/
715 @trusted class ThompsonMatcher(Char, StreamType = Input!Char): Matcher!Char
716 if (is(Char : dchar))
717 {
718 alias DataIndex = Stream.DataIndex;
719 alias Stream = StreamType;
720 alias OpFunc = bool function(ThompsonMatcher, State*) pure;
721 alias BackMatcher = ThompsonMatcher!(Char, BackLooper!(Stream));
722 alias OpBackFunc = bool function(BackMatcher, BackMatcher.State*) pure;
723 Thread!DataIndex* freelist;
724 ThreadList!DataIndex clist, nlist;
725 DataIndex[] merge;
726 Group!DataIndex[] backrefed;
727 const Regex!Char re; //regex program
728 Stream s;
729 dchar front;
730 DataIndex index;
731 DataIndex genCounter; //merge trace counter, goes up on every dchar
732 size_t[size_t] subCounters; //a table of gen counter per sub-engine: PC -> counter
733 OpFunc[] opCacheTrue; // pointers to Op!(IR.xyz) for each bytecode
734 OpFunc[] opCacheFalse; // ditto
735 OpBackFunc[] opCacheBackTrue; // ditto
736 OpBackFunc[] opCacheBackFalse; // ditto
737 size_t threadSize;
738 size_t _refCount;
739 int matched;
740 bool exhausted;
741
742 final:
743 pure
744 static struct State
745 {
746 Thread!DataIndex* t;
747 ThreadList!DataIndex worklist;
748 Group!DataIndex[] matches;
749
popStateState750 bool popState(E)(E e)
751 {
752 with(e)
753 {
754 recycle(t);
755 t = worklist.fetch();
756 return t != null;
757 }
758 }
759
760 }
761
762 static if (__traits(hasMember,Stream, "search"))
763 {
764 enum kicked = true;
765 }
766 else
767 enum kicked = false;
768
769 static size_t getThreadSize(const ref Regex!Char re)
770 {
771 return re.ngroup
772 ? (Thread!DataIndex).sizeof + (re.ngroup-1)*(Group!DataIndex).sizeof
773 : (Thread!DataIndex).sizeof - (Group!DataIndex).sizeof;
774 }
775
776 static size_t initialMemory(const ref Regex!Char re)
777 {
778 return getThreadSize(re)*re.threadCount + re.hotspotTableSize*size_t.sizeof
779 +4*OpFunc.sizeof*re.ir.length;
780 }
781
782 //true if it's start of input
atStart()783 @property bool atStart(){ return index == 0; }
784
785 //true if it's end of input
atEnd()786 @property bool atEnd(){ return index == s.lastIndex && s.atEnd; }
787
refCount()788 override @property ref size_t refCount() @safe { return _refCount; }
789
pattern()790 override @property ref const(Regex!Char) pattern() @safe { return re; }
791
next()792 bool next()
793 {
794 if (!s.nextChar(front, index))
795 {
796 index = s.lastIndex;
797 return false;
798 }
799 return true;
800 }
801
802 static if (kicked)
803 {
search()804 bool search()
805 {
806
807 if (!s.search(re.kickstart, front, index))
808 {
809 index = s.lastIndex;
810 return false;
811 }
812 return true;
813 }
814 }
815
initExternalMemory(void[]memory)816 void initExternalMemory(void[] memory)
817 {
818 threadSize = getThreadSize(re);
819 prepareFreeList(re.threadCount, memory);
820 if (re.hotspotTableSize)
821 {
822 merge = arrayInChunk!(DataIndex)(re.hotspotTableSize, memory);
823 merge[] = 0;
824 }
825 opCacheTrue = arrayInChunk!(OpFunc)(re.ir.length, memory);
826 opCacheFalse = arrayInChunk!(OpFunc)(re.ir.length, memory);
827 opCacheBackTrue = arrayInChunk!(OpBackFunc)(re.ir.length, memory);
828 opCacheBackFalse = arrayInChunk!(OpBackFunc)(re.ir.length, memory);
829
830 for (uint pc = 0; pc<re.ir.length; pc += re.ir[pc].length)
831 {
832 L_dispatch:
833 switch (re.ir[pc].code)
834 {
835 foreach (e; __traits(allMembers, IR))
836 {
837 mixin(`case IR.`~e~`:
838 opCacheTrue[pc] = &Ops!(true).op!(IR.`~e~`);
839 opCacheBackTrue[pc] = &BackOps!(true).op!(IR.`~e~`);
840 opCacheFalse[pc] = &Ops!(false).op!(IR.`~e~`);
841 opCacheBackFalse[pc] = &BackOps!(false).op!(IR.`~e~`);
842 break L_dispatch;
843 `);
844 }
845 default:
846 assert(0, "Unrecognized instruction "~re.ir[pc].mnemonic);
847 }
848 }
849 }
850
851 override Matcher!Char rearm(in Char[] data)
852 {
853 exhausted = false;
854 matched = 0;
855 s = Stream(data);
856 return this;
857 }
858
this()859 this()(ref const Regex!Char program, Stream stream, void[] memory)
860 {
861 // We are emplace'd to malloced memory w/o blitting T.init over it\
862 // make sure we initialize all fields explicitly
863 _refCount = 1;
864 subCounters = null;
865 backrefed = null;
866 exhausted = false;
867 matched = 0;
868 re = program;
869 s = stream;
870 initExternalMemory(memory);
871 genCounter = 0;
872 }
873
this(ThompsonMatcher matcher,size_t lo,size_t hi,uint nGroup,Stream stream)874 this(ThompsonMatcher matcher, size_t lo, size_t hi, uint nGroup, Stream stream)
875 {
876 _refCount = 1;
877 subCounters = matcher.subCounters;
878 s = stream;
879 auto code = matcher.re.ir[lo .. hi];
880 re = matcher.re.withCode(code).withNGroup(nGroup);
881 threadSize = matcher.threadSize;
882 merge = matcher.merge;
883 freelist = matcher.freelist;
884 opCacheTrue = matcher.opCacheTrue[lo .. hi];
885 opCacheBackTrue = matcher.opCacheBackTrue[lo .. hi];
886 opCacheFalse = matcher.opCacheFalse[lo .. hi];
887 opCacheBackFalse = matcher.opCacheBackFalse[lo .. hi];
888 front = matcher.front;
889 index = matcher.index;
890 }
891
this(BackMatcher matcher,size_t lo,size_t hi,uint nGroup,Stream stream)892 this(BackMatcher matcher, size_t lo, size_t hi, uint nGroup, Stream stream)
893 {
894 _refCount = 1;
895 subCounters = matcher.subCounters;
896 s = stream;
897 auto code = matcher.re.ir[lo .. hi];
898 re = matcher.re.withCode(code).withNGroup(nGroup);
899 threadSize = matcher.threadSize;
900 merge = matcher.merge;
901 freelist = matcher.freelist;
902 opCacheTrue = matcher.opCacheBackTrue[lo .. hi];
903 opCacheBackTrue = matcher.opCacheTrue[lo .. hi];
904 opCacheFalse = matcher.opCacheBackFalse[lo .. hi];
905 opCacheBackFalse = matcher.opCacheFalse[lo .. hi];
906 front = matcher.front;
907 index = matcher.index;
908 }
909
fwdMatcher()910 auto fwdMatcher()(size_t lo, size_t hi, uint nGroup, size_t counter)
911 {
912 auto m = new ThompsonMatcher!(Char, Stream)(this, lo, hi, nGroup, s);
913 m.genCounter = counter;
914 return m;
915 }
916
bwdMatcher()917 auto bwdMatcher()(size_t lo, size_t hi, uint nGroup, size_t counter)
918 {
919 alias BackLooper = typeof(s.loopBack(index));
920 auto m = new ThompsonMatcher!(Char, BackLooper)(this, lo, hi, nGroup, s.loopBack(index));
921 m.genCounter = counter;
922 m.next();
923 return m;
924 }
925
926 override void dupTo(Matcher!Char engine, void[] memory)
927 {
928 auto thompson = cast(ThompsonMatcher) engine;
929 thompson.s = s;
930 thompson.subCounters = null;
931 thompson.front = front;
932 thompson.index = index;
933 thompson.matched = matched;
934 thompson.exhausted = exhausted;
935 thompson.initExternalMemory(memory);
936 }
937
938 override int match(Group!DataIndex[] matches)
939 {
940 debug(std_regex_matcher)
941 writeln("------------------------------------------");
942 if (exhausted)
943 {
944 return false;
945 }
946 if (re.flags & RegexInfo.oneShot)
947 {
948 next();
949 exhausted = true;
950 return matchOneShot(matches);
951 }
952 static if (kicked)
953 if (!re.kickstart.empty)
954 return matchImpl!(true)(matches);
955 return matchImpl!(false)(matches);
956 }
957
958 //match the input and fill matches
matchImpl(bool withSearch)959 int matchImpl(bool withSearch)(Group!DataIndex[] matches)
960 {
961 if (!matched && clist.empty)
962 {
963 static if (withSearch)
964 search();
965 else
966 next();
967 }
968 else//char in question is fetched in prev call to match
969 {
970 matched = 0;
971 }
972 State state;
973 state.matches = matches;
974
975 if (!atEnd)//if no char
976 for (;;)
977 {
978 genCounter++;
979 debug(std_regex_matcher)
980 {
981 writefln("Threaded matching threads at %s", s[index .. s.lastIndex]);
982 foreach (t; clist[])
983 {
984 assert(t);
985 writef("pc=%s ",t.pc);
986 write(t.matches);
987 writeln();
988 }
989 }
990 for (state.t = clist.fetch(); state.t; state.t = clist.fetch())
991 {
992 eval!true(&state);
993 }
994 //if we already have match no need to push the engine
995 if (!matched)
996 {
997 state.t = createStart(index);
998 eval!true(&state);//new thread staring at this position
999 }
1000 else if (nlist.empty)
1001 {
1002 debug(std_regex_matcher) writeln("Stopped matching before consuming full input");
1003 break;//not a partial match for sure
1004 }
1005 clist = nlist;
1006 nlist = (ThreadList!DataIndex).init;
1007 if (clist.tip is null)
1008 {
1009 static if (withSearch)
1010 {
1011 if (!search())
1012 break;
1013 }
1014 else
1015 {
1016 if (!next())
1017 break;
1018 }
1019 }
1020 else if (!next())
1021 {
1022 if (!atEnd) return false;
1023 exhausted = true;
1024 break;
1025 }
1026 }
1027
1028 genCounter++; //increment also on each end
1029 debug(std_regex_matcher) writefln("Threaded matching threads at end");
1030 //try out all zero-width posibilities
1031 for (state.t = clist.fetch(); state.t; state.t = clist.fetch())
1032 {
1033 eval!false(&state);
1034 }
1035 if (!matched)
1036 {
1037 state.t = createStart(index);
1038 eval!false(&state);//new thread starting at end of input
1039 }
1040 if (matched)
1041 {//in case NFA found match along the way
1042 //and last possible longer alternative ultimately failed
1043 s.reset(matches[0].end);//reset to last successful match
1044 next();//and reload front character
1045 //--- here the exact state of stream was restored ---
1046 exhausted = atEnd || !(re.flags & RegexOption.global);
1047 //+ empty match advances the input
1048 if (!exhausted && matches[0].begin == matches[0].end)
1049 next();
1050 }
1051 return matched;
1052 }
1053
1054 /+
1055 handle succesful threads
1056 +/
1057 void finish(const(Thread!DataIndex)* t, Group!DataIndex[] matches, int code)
1058 {
1059 matches.ptr[0 .. re.ngroup] = t.matches.ptr[0 .. re.ngroup];
debug(std_regex_matcher)1060 debug(std_regex_matcher)
1061 {
1062 writef("FOUND pc=%s prog_len=%s",
1063 t.pc, re.ir.length);
1064 if (!matches.empty)
1065 writefln(": %s..%s", matches[0].begin, matches[0].end);
1066 foreach (v; matches)
1067 writefln("%d .. %d", v.begin, v.end);
1068 }
1069 matched = code;
1070 }
1071
1072 alias Ops(bool withInput) = ThompsonOps!(ThompsonMatcher, State, withInput);
1073 alias BackOps(bool withInput) = ThompsonOps!(BackMatcher, BackMatcher.State, withInput);
1074
1075 /+
1076 match thread against codepoint, cutting trough all 0-width instructions
1077 and taking care of control flow, then add it to nlist
1078 +/
eval(bool withInput)1079 void eval(bool withInput)(State* state)
1080 {
1081 debug(std_regex_matcher) writeln("---- Evaluating thread");
1082 static if (withInput)
1083 while (opCacheTrue.ptr[state.t.pc](this, state)){}
1084 else
1085 while (opCacheFalse.ptr[state.t.pc](this, state)){}
1086 }
1087 enum uint RestartPc = uint.max;
1088 //match the input, evaluating IR without searching
1089 int matchOneShot(Group!DataIndex[] matches, uint startPc = 0)
1090 {
debug(std_regex_matcher)1091 debug(std_regex_matcher)
1092 {
1093 writefln("---------------single shot match ----------------- ");
1094 }
1095 alias evalFn = eval;
1096 assert(clist == (ThreadList!DataIndex).init || startPc == RestartPc); // incorrect after a partial match
1097 assert(nlist == (ThreadList!DataIndex).init || startPc == RestartPc);
1098 State state;
1099 state.matches = matches;
1100 if (!atEnd)//if no char
1101 {
debug(std_regex_matcher)1102 debug(std_regex_matcher)
1103 {
1104 writefln("-- Threaded matching threads at %s", s[index .. s.lastIndex]);
1105 }
1106 if (startPc != RestartPc)
1107 {
1108 state.t = createStart(index, startPc);
1109 genCounter++;
1110 evalFn!true(&state);
1111 }
1112 for (;;)
1113 {
1114 debug(std_regex_matcher) writeln("\n-- Started iteration of main cycle");
1115 genCounter++;
debug(std_regex_matcher)1116 debug(std_regex_matcher)
1117 {
1118 foreach (t; clist[])
1119 {
1120 assert(t);
1121 }
1122 }
1123 for (state.t = clist.fetch(); state.t; state.t = clist.fetch())
1124 {
1125 evalFn!true(&state);
1126 }
1127 if (nlist.empty)
1128 {
1129 debug(std_regex_matcher) writeln("Stopped matching before consuming full input");
1130 break;//not a partial match for sure
1131 }
1132 clist = nlist;
1133 nlist = (ThreadList!DataIndex).init;
1134 if (!next())
1135 break;
1136 debug(std_regex_matcher) writeln("-- Ended iteration of main cycle\n");
1137 }
1138 }
1139 genCounter++; //increment also on each end
1140 debug(std_regex_matcher) writefln("-- Matching threads at end");
1141 //try out all zero-width posibilities
1142 for (state.t = clist.fetch(); state.t; state.t = clist.fetch())
1143 {
1144 evalFn!false(&state);
1145 }
1146 if (!matched)
1147 {
1148 state.t = createStart(index, startPc);
1149 evalFn!false(&state);
1150 }
1151 return matched;
1152 }
1153
1154 //get a dirty recycled Thread
1155 Thread!DataIndex* allocate()
1156 {
1157 assert(freelist, "not enough preallocated memory");
1158 Thread!DataIndex* t = freelist;
1159 freelist = freelist.next;
1160 return t;
1161 }
1162
1163 //link memory into a free list of Threads
prepareFreeList(size_t size,ref void[]memory)1164 void prepareFreeList(size_t size, ref void[] memory)
1165 {
1166 void[] mem = memory[0 .. threadSize*size];
1167 memory = memory[threadSize * size .. $];
1168 freelist = cast(Thread!DataIndex*)&mem[0];
1169 size_t i;
1170 for (i = threadSize; i < threadSize*size; i += threadSize)
1171 (cast(Thread!DataIndex*)&mem[i-threadSize]).next = cast(Thread!DataIndex*)&mem[i];
1172 (cast(Thread!DataIndex*)&mem[i-threadSize]).next = null;
1173 }
1174
1175 //dispose a thread
1176 void recycle(Thread!DataIndex* t)
1177 {
1178 t.next = freelist;
1179 freelist = t;
1180 }
1181
1182 //dispose list of threads
1183 void recycle(ref ThreadList!DataIndex list)
1184 {
1185 if (list.tip)
1186 {
1187 // just put this head-tail list in front of freelist
1188 list.toe.next = freelist;
1189 freelist = list.tip;
1190 list = list.init;
1191 }
1192 }
1193
1194 //creates a copy of master thread with given pc
1195 Thread!DataIndex* fork(Thread!DataIndex* master, uint pc, uint counter)
1196 {
1197 auto t = allocate();
1198 t.matches.ptr[0 .. re.ngroup] = master.matches.ptr[0 .. re.ngroup];
1199 t.pc = pc;
1200 t.counter = counter;
1201 t.uopCounter = 0;
1202 return t;
1203 }
1204
1205 //creates a start thread
1206 Thread!DataIndex* createStart(DataIndex index, uint pc = 0)
1207 {
1208 auto t = allocate();
1209 t.matches.ptr[0 .. re.ngroup] = (Group!DataIndex).init;
1210 t.matches[0].begin = index;
1211 t.pc = pc;
1212 t.counter = 0;
1213 t.uopCounter = 0;
1214 return t;
1215 }
1216 }
1217