xref: /netbsd-src/external/gpl3/gcc.old/dist/libphobos/src/std/regex/internal/thompson.d (revision 627f7eb200a4419d89b531d55fccd2ee3ffdcde0)
1 //Written in the D programming language
2 /*
3     Implementation of Thompson NFA std.regex engine.
4     Key point is evaluation of all possible threads (state) at each step
5     in a breadth-first manner, thereby geting some nice properties:
6         - looking at each character only once
7         - merging of equivalent threads, that gives matching process linear time complexity
8 */
9 module std.regex.internal.thompson;
10 
11 package(std.regex):
12 
13 import std.range.primitives;
14 import std.regex.internal.ir;
15 
16 //State of VM thread
Thread(DataIndex)17 struct Thread(DataIndex)
18 {
19     Thread* next;    //intrusive linked list
20     uint pc;
21     uint counter;    //loop counter
22     uint uopCounter; //counts micro operations inside one macro instruction (e.g. BackRef)
23     Group!DataIndex[1] matches;
24 }
25 
26 //head-tail singly-linked list
ThreadList(DataIndex)27 struct ThreadList(DataIndex)
28 {
29     Thread!DataIndex* tip = null, toe = null;
30     //add new thread to the start of list
31     void insertFront(Thread!DataIndex* t)
32     {
33         if (tip)
34         {
35             t.next = tip;
36             tip = t;
37         }
38         else
39         {
40             t.next = null;
41             tip = toe = t;
42         }
43     }
44     //add new thread to the end of list
45     void insertBack(Thread!DataIndex* t)
46     {
47         if (toe)
48         {
49             toe.next = t;
50             toe = t;
51         }
52         else
53             tip = toe = t;
54         toe.next = null;
55     }
56     //move head element out of list
57     Thread!DataIndex* fetch()
58     {
59         auto t = tip;
60         if (tip == toe)
61             tip = toe = null;
62         else
63             tip = tip.next;
64         return t;
65     }
66     //non-destructive iteration of ThreadList
67     struct ThreadRange
68     {
69         const(Thread!DataIndex)* ct;
70         this(ThreadList tlist){ ct = tlist.tip; }
71         @property bool empty(){ return ct is null; }
72         @property const(Thread!DataIndex)* front(){ return ct; }
73         @property popFront()
74         {
75             assert(ct);
76             ct = ct.next;
77         }
78     }
79     @property bool empty()
80     {
81         return tip == null;
82     }
83     ThreadRange opSlice()
84     {
85         return ThreadRange(this);
86     }
87 }
88 
89 template ThompsonOps(E, S, bool withInput:true)
90 {
91 @trusted:
92     static bool op(IR code:IR.End)(E* e, S* state)
93     {
with(state)94         with(e) with(state)
95         {
96             finish(t, matches, re.ir[t.pc].data);
97             //fix endpoint of the whole match
98             matches[0].end = index;
99             recycle(t);
100             //cut off low priority threads
101             recycle(clist);
102             recycle(worklist);
103             debug(std_regex_matcher) writeln("Finished thread ", matches);
104             return false; // no more state to eval
105         }
106     }
107 
108     static bool op(IR code:IR.Wordboundary)(E* e, S* state)
109     {
with(state)110         with(e) with(state)
111         {
112             dchar back;
113             DataIndex bi;
114             //at start & end of input
115             if (atStart && wordMatcher[front])
116             {
117                 t.pc += IRL!(IR.Wordboundary);
118                 return true;
119             }
120             else if (atEnd && s.loopBack(index).nextChar(back, bi)
121                     && wordMatcher[back])
122             {
123                 t.pc += IRL!(IR.Wordboundary);
124                 return true;
125             }
126             else if (s.loopBack(index).nextChar(back, bi))
127             {
128                 bool af = wordMatcher[front];
129                 bool ab = wordMatcher[back];
130                 if (af ^ ab)
131                 {
132                     t.pc += IRL!(IR.Wordboundary);
133                     return true;
134                 }
135             }
136             return popState(e);
137         }
138     }
139 
140     static bool op(IR code:IR.Notwordboundary)(E* e, S* state)
141     {
with(state)142         with(e) with(state)
143         {
144             dchar back;
145             DataIndex bi;
146             //at start & end of input
147             if (atStart && wordMatcher[front])
148             {
149                 return popState(e);
150             }
151             else if (atEnd && s.loopBack(index).nextChar(back, bi)
152                     && wordMatcher[back])
153             {
154                 return popState(e);
155             }
156             else if (s.loopBack(index).nextChar(back, bi))
157             {
158                 bool af = wordMatcher[front];
159                 bool ab = wordMatcher[back]  != 0;
160                 if (af ^ ab)
161                 {
162                     return popState(e);
163                 }
164             }
165             t.pc += IRL!(IR.Notwordboundary);
166         }
167         return true;
168     }
169 
170     static bool op(IR code:IR.Bof)(E* e, S* state)
171     {
with(state)172         with(e) with(state)
173         {
174             if (atStart)
175             {
176                 t.pc += IRL!(IR.Bof);
177                 return true;
178             }
179             else
180             {
181                 return popState(e);
182             }
183         }
184     }
185 
186     static bool op(IR code:IR.Bol)(E* e, S* state)
187     {
with(state)188         with(e) with(state)
189         {
190             dchar back;
191             DataIndex bi;
192             if (atStart
193                 ||(s.loopBack(index).nextChar(back,bi)
194                 && startOfLine(back, front == '\n')))
195             {
196                 t.pc += IRL!(IR.Bol);
197                 return true;
198             }
199             else
200             {
201                 return popState(e);
202             }
203         }
204     }
205 
206     static bool op(IR code:IR.Eof)(E* e, S* state)
207     {
with(state)208         with(e) with(state)
209         {
210             if (atEnd)
211             {
212                 t.pc += IRL!(IR.Eol);
213                 return true;
214             }
215             else
216             {
217                 return popState(e);
218             }
219         }
220     }
221 
222     static bool op(IR code:IR.Eol)(E* e, S* state)
223     {
with(state)224         with(e) with(state)
225         {
226             dchar back;
227             DataIndex bi;
228             //no matching inside \r\n
229             if (atEnd || (endOfLine(front, s.loopBack(index).nextChar(back, bi)
230                     && back == '\r')))
231             {
232                 t.pc += IRL!(IR.Eol);
233                 return true;
234             }
235             else
236             {
237                 return popState(e);
238             }
239 
240         }
241     }
242 
243     static bool op(IR code:IR.InfiniteStart)(E* e, S* state)
244     {
245         with(e) with(state)
246             t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteStart);
247         return op!(IR.InfiniteEnd)(e,state);
248     }
249 
250     static bool op(IR code:IR.InfiniteBloomStart)(E* e, S* state)
251     {
252         with(e) with(state)
253             t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteBloomStart);
254         return op!(IR.InfiniteBloomEnd)(e,state);
255     }
256 
257     static bool op(IR code:IR.InfiniteQStart)(E* e, S* state)
258     {
259         with(e) with(state)
260             t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteQStart);
261         return op!(IR.InfiniteQEnd)(e,state);
262     }
263 
264     static bool op(IR code:IR.RepeatStart)(E* e, S* state)
265     {
266         with(e) with(state)
267             t.pc += re.ir[t.pc].data + IRL!(IR.RepeatStart);
268         return op!(IR.RepeatEnd)(e,state);
269     }
270 
271     static bool op(IR code:IR.RepeatQStart)(E* e, S* state)
272     {
273         with(e) with(state)
274             t.pc += re.ir[t.pc].data + IRL!(IR.RepeatQStart);
275         return op!(IR.RepeatQEnd)(e,state);
276     }
277 
278     static bool op(IR code)(E* e, S* state)
279         if (code == IR.RepeatEnd || code == IR.RepeatQEnd)
280     {
with(state)281         with(e) with(state)
282         {
283             //len, step, min, max
284                 uint len = re.ir[t.pc].data;
285                 uint step =  re.ir[t.pc+2].raw;
286                 uint min = re.ir[t.pc+3].raw;
287                 if (t.counter < min)
288                 {
289                     t.counter += step;
290                     t.pc -= len;
291                     return true;
292                 }
293                 if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter)
294                 {
295                     debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s",
296                                     t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] );
297                     merge[re.ir[t.pc + 1].raw+t.counter] = genCounter;
298                 }
299                 else
300                 {
301                     debug(std_regex_matcher)
302                         writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s",
303                             t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] );
304                     return popState(e);
305                 }
306                 uint max = re.ir[t.pc+4].raw;
307                 if (t.counter < max)
308                 {
309                     if (re.ir[t.pc].code == IR.RepeatEnd)
310                     {
311                         //queue out-of-loop thread
312                         worklist.insertFront(fork(t, t.pc + IRL!(IR.RepeatEnd),  t.counter % step));
313                         t.counter += step;
314                         t.pc -= len;
315                     }
316                     else
317                     {
318                         //queue into-loop thread
319                         worklist.insertFront(fork(t, t.pc - len,  t.counter + step));
320                         t.counter %= step;
321                         t.pc += IRL!(IR.RepeatEnd);
322                     }
323                 }
324                 else
325                 {
326                     t.counter %= step;
327                     t.pc += IRL!(IR.RepeatEnd);
328                 }
329                 return true;
330         }
331     }
332 
333     static bool op(IR code)(E* e, S* state)
334         if (code == IR.InfiniteEnd || code == IR.InfiniteQEnd)
335     {
with(state)336         with(e) with(state)
337         {
338             if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter)
339             {
340                 debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s",
341                                 t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] );
342                 merge[re.ir[t.pc + 1].raw+t.counter] = genCounter;
343             }
344             else
345             {
346                 debug(std_regex_matcher) writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s",
347                                 t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] );
348                 return popState(e);
349             }
350             uint len = re.ir[t.pc].data;
351             uint pc1, pc2; //branches to take in priority order
352             if (re.ir[t.pc].code == IR.InfiniteEnd)
353             {
354                 pc1 = t.pc - len;
355                 pc2 = t.pc + IRL!(IR.InfiniteEnd);
356             }
357             else
358             {
359                 pc1 = t.pc + IRL!(IR.InfiniteEnd);
360                 pc2 = t.pc - len;
361             }
362             worklist.insertFront(fork(t, pc2, t.counter));
363             t.pc = pc1;
364             return true;
365         }
366     }
367 
368     static bool op(IR code)(E* e, S* state)
369         if (code == IR.InfiniteBloomEnd)
370     {
with(state)371         with(e) with(state)
372         {
373             if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter)
374             {
375                 debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s",
376                                 t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] );
377                 merge[re.ir[t.pc + 1].raw+t.counter] = genCounter;
378             }
379             else
380             {
381                 debug(std_regex_matcher) writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s",
382                                 t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] );
383                 return popState(e);
384             }
385             uint len = re.ir[t.pc].data;
386             uint pc1, pc2; //branches to take in priority order
387             pc1 = t.pc - len;
388             pc2 = t.pc + IRL!(IR.InfiniteBloomEnd);
389             uint filterIndex = re.ir[t.pc + 2].raw;
390             if (re.filters[filterIndex][front])
391                 worklist.insertFront(fork(t, pc2, t.counter));
392             t.pc = pc1;
393             return true;
394         }
395     }
396 
397     static bool op(IR code:IR.OrEnd)(E* e, S* state)
398     {
with(state)399         with(e) with(state)
400         {
401             if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter)
402             {
403                 debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s",
404                                 t.pc, s[index .. s.lastIndex], genCounter, merge[re.ir[t.pc + 1].raw + t.counter] );
405                 merge[re.ir[t.pc + 1].raw+t.counter] = genCounter;
406                 t.pc += IRL!(IR.OrEnd);
407             }
408             else
409             {
410                 debug(std_regex_matcher) writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s",
411                                 t.pc, s[index .. s.lastIndex], genCounter, merge[re.ir[t.pc + 1].raw + t.counter] );
412                 return popState(e);
413             }
414             return true;
415         }
416     }
417 
418     static bool op(IR code:IR.OrStart)(E* e, S* state)
419     {
with(state)420         with(e) with(state)
421         {
422             t.pc += IRL!(IR.OrStart);
423             return op!(IR.Option)(e,state);
424         }
425     }
426 
427     static bool op(IR code:IR.Option)(E* e, S* state)
428     {
with(state)429         with(e) with(state)
430         {
431             uint next = t.pc + re.ir[t.pc].data + IRL!(IR.Option);
432             //queue next Option
433             if (re.ir[next].code == IR.Option)
434             {
435                 worklist.insertFront(fork(t, next, t.counter));
436             }
437             t.pc += IRL!(IR.Option);
438             return true;
439         }
440     }
441 
442     static bool op(IR code:IR.GotoEndOr)(E* e, S* state)
443     {
with(state)444         with(e) with(state)
445         {
446             t.pc = t.pc + re.ir[t.pc].data + IRL!(IR.GotoEndOr);
447             return op!(IR.OrEnd)(e, state);
448         }
449     }
450 
451     static bool op(IR code:IR.GroupStart)(E* e, S* state)
452     {
with(state)453         with(e) with(state)
454         {
455             uint n = re.ir[t.pc].data;
456             t.matches.ptr[n].begin = index;
457             t.pc += IRL!(IR.GroupStart);
458             return true;
459         }
460     }
461     static bool op(IR code:IR.GroupEnd)(E* e, S* state)
462     {
with(state)463         with(e) with(state)
464         {
465             uint n = re.ir[t.pc].data;
466             t.matches.ptr[n].end = index;
467             t.pc += IRL!(IR.GroupEnd);
468             return true;
469         }
470     }
471 
472     static bool op(IR code:IR.Backref)(E* e, S* state)
473     {
with(state)474         with(e) with(state)
475         {
476             uint n = re.ir[t.pc].data;
477             Group!DataIndex* source = re.ir[t.pc].localRef ? t.matches.ptr : backrefed.ptr;
478             assert(source);
479             if (source[n].begin == source[n].end)//zero-width Backref!
480             {
481                 t.pc += IRL!(IR.Backref);
482                 return true;
483             }
484             else
485             {
486                 size_t idx = source[n].begin + t.uopCounter;
487                 size_t end = source[n].end;
488                 if (s[idx .. end].front == front)
489                 {
490                     import std.utf : stride;
491 
492                     t.uopCounter += stride(s[idx .. end], 0);
493                     if (t.uopCounter + source[n].begin == source[n].end)
494                     {//last codepoint
495                         t.pc += IRL!(IR.Backref);
496                         t.uopCounter = 0;
497                     }
498                     nlist.insertBack(t);
499                 }
500                 else
501                     recycle(t);
502                 t = worklist.fetch();
503                 return t != null;
504             }
505         }
506     }
507 
508 
509     static bool op(IR code)(E* e, S* state)
510         if (code == IR.LookbehindStart || code == IR.NeglookbehindStart)
511     {
with(state)512         with(e) with(state)
513         {
514             uint len = re.ir[t.pc].data;
515             uint ms = re.ir[t.pc + 1].raw, me = re.ir[t.pc + 2].raw;
516             uint end = t.pc + len + IRL!(IR.LookbehindEnd) + IRL!(IR.LookbehindStart);
517             bool positive = re.ir[t.pc].code == IR.LookbehindStart;
518             static if (Stream.isLoopback)
519                 auto matcher = fwdMatcher(t.pc, end, subCounters.get(t.pc, 0));
520             else
521                 auto matcher = bwdMatcher(t.pc, end, subCounters.get(t.pc, 0));
522             matcher.re.ngroup = me - ms;
523             matcher.backrefed = backrefed.empty ? t.matches : backrefed;
524             //backMatch
525             auto mRes = matcher.matchOneShot(t.matches.ptr[ms .. me], IRL!(IR.LookbehindStart));
526             freelist = matcher.freelist;
527             subCounters[t.pc] = matcher.genCounter;
528             if ((mRes != 0 ) ^ positive)
529             {
530                 return popState(e);
531             }
532             t.pc = end;
533             return true;
534         }
535     }
536 
537     static bool op(IR code)(E* e, S* state)
538         if (code == IR.LookaheadStart || code == IR.NeglookaheadStart)
539     {
with(state)540         with(e) with(state)
541         {
542             auto save = index;
543             uint len = re.ir[t.pc].data;
544             uint ms = re.ir[t.pc+1].raw, me = re.ir[t.pc+2].raw;
545             uint end = t.pc+len+IRL!(IR.LookaheadEnd)+IRL!(IR.LookaheadStart);
546             bool positive = re.ir[t.pc].code == IR.LookaheadStart;
547             static if (Stream.isLoopback)
548                 auto matcher = bwdMatcher(t.pc, end, subCounters.get(t.pc, 0));
549             else
550                 auto matcher = fwdMatcher(t.pc, end, subCounters.get(t.pc, 0));
551             matcher.re.ngroup = me - ms;
552             matcher.backrefed = backrefed.empty ? t.matches : backrefed;
553             auto mRes = matcher.matchOneShot(t.matches.ptr[ms .. me], IRL!(IR.LookaheadStart));
554             freelist = matcher.freelist;
555             subCounters[t.pc] = matcher.genCounter;
556             s.reset(index);
557             next();
558             if ((mRes != 0) ^ positive)
559             {
560                 return popState(e);
561             }
562             t.pc = end;
563             return true;
564         }
565     }
566 
567     static bool op(IR code)(E* e, S* state)
568         if (code == IR.LookaheadEnd || code == IR.NeglookaheadEnd ||
569             code == IR.LookbehindEnd || code == IR.NeglookbehindEnd)
570     {
with(state)571         with(e) with(state)
572         {
573                 finish(t, matches.ptr[0 .. re.ngroup], re.ir[t.pc].data);
574                 recycle(t);
575                 //cut off low priority threads
576                 recycle(clist);
577                 recycle(worklist);
578                 return false; // no more state
579         }
580     }
581 
582     static bool op(IR code:IR.Nop)(E* e, S* state)
583     {
584         with(state) t.pc += IRL!(IR.Nop);
585         return true;
586     }
587 
588     static bool op(IR code:IR.OrChar)(E* e, S* state)
589     {
with(state)590         with(e) with(state)
591         {
592             uint len = re.ir[t.pc].sequence;
593             uint end = t.pc + len;
594             static assert(IRL!(IR.OrChar) == 1);
595             for (; t.pc < end; t.pc++)
596                 if (re.ir[t.pc].data == front)
597                     break;
598             if (t.pc != end)
599             {
600                 t.pc = end;
601                 nlist.insertBack(t);
602             }
603             else
604                 recycle(t);
605             t = worklist.fetch();
606             return t != null;
607         }
608     }
609 
610     static bool op(IR code:IR.Char)(E* e, S* state)
611     {
with(state)612         with(e) with(state)
613         {
614             if (front == re.ir[t.pc].data)
615             {
616                 t.pc += IRL!(IR.Char);
617                 nlist.insertBack(t);
618             }
619             else
620                 recycle(t);
621             t = worklist.fetch();
622             return t != null;
623         }
624     }
625 
626     static bool op(IR code:IR.Any)(E* e, S* state)
627     {
with(state)628         with(e) with(state)
629         {
630             t.pc += IRL!(IR.Any);
631             nlist.insertBack(t);
632             t = worklist.fetch();
633             return t != null;
634         }
635     }
636 
637     static bool op(IR code:IR.CodepointSet)(E* e, S* state)
638     {
with(state)639         with(e) with(state)
640         {
641             if (re.charsets[re.ir[t.pc].data].scanFor(front))
642             {
643                 t.pc += IRL!(IR.CodepointSet);
644                 nlist.insertBack(t);
645             }
646             else
647             {
648                 recycle(t);
649             }
650             t = worklist.fetch();
651             return t != null;
652         }
653     }
654 
655     static bool op(IR code:IR.Trie)(E* e, S* state)
656     {
with(state)657         with(e) with(state)
658         {
659             if (re.matchers[re.ir[t.pc].data][front])
660             {
661                 t.pc += IRL!(IR.Trie);
662                 nlist.insertBack(t);
663             }
664             else
665             {
666                 recycle(t);
667             }
668             t = worklist.fetch();
669             return t != null;
670         }
671     }
672 
673 }
674 
675 template ThompsonOps(E,S, bool withInput:false)
676 {
677 @trusted:
678     // can't match these without input
679     static bool op(IR code)(E* e, S* state)
680         if (code == IR.Char || code == IR.OrChar || code == IR.CodepointSet
681         || code == IR.Trie || code == IR.Char || code == IR.Any)
682     {
683         return state.popState(e);
684     }
685 
686     // special case of zero-width backref
687     static bool op(IR code:IR.Backref)(E* e, S* state)
688     {
with(state)689         with(e) with(state)
690         {
691             uint n = re.ir[t.pc].data;
692             Group!DataIndex* source = re.ir[t.pc].localRef ? t.matches.ptr : backrefed.ptr;
693             assert(source);
694             if (source[n].begin == source[n].end)//zero-width Backref!
695             {
696                 t.pc += IRL!(IR.Backref);
697                 return true;
698             }
699             else
700                 return popState(e);
701         }
702     }
703 
704     // forward all control flow to normal versions
705     static bool op(IR code)(E* e, S* state)
706         if (code != IR.Char && code != IR.OrChar && code != IR.CodepointSet
707         && code != IR.Trie && code != IR.Char && code != IR.Any && code != IR.Backref)
708     {
709         return ThompsonOps!(E,S,true).op!code(e,state);
710     }
711 }
712 
713 /+
714    Thomspon matcher does all matching in lockstep,
715    never looking at the same char twice
716 +/
717 @trusted struct ThompsonMatcher(Char, StreamType = Input!Char)
718 if (is(Char : dchar))
719 {
720     alias DataIndex = Stream.DataIndex;
721     alias Stream = StreamType;
722     alias OpFunc = bool function(ThompsonMatcher*, State*);
723     alias BackMatcher = ThompsonMatcher!(Char, BackLooper!(Stream));
724     alias OpBackFunc = bool function(BackMatcher*, BackMatcher.State*);
725     Thread!DataIndex* freelist;
726     ThreadList!DataIndex clist, nlist;
727     DataIndex[] merge;
728     Group!DataIndex[] backrefed;
729     Regex!Char re;           //regex program
730     Stream s;
731     dchar front;
732     DataIndex index;
733     DataIndex genCounter;    //merge trace counter, goes up on every dchar
734     size_t[size_t] subCounters; //a table of gen counter per sub-engine: PC -> counter
735     OpFunc[] opCacheTrue;   // pointers to Op!(IR.xyz) for each bytecode
736     OpFunc[] opCacheFalse;  // ditto
737     OpBackFunc[] opCacheBackTrue;   // ditto
738     OpBackFunc[] opCacheBackFalse;  // ditto
739     size_t threadSize;
740     int matched;
741     bool exhausted;
742 
743     static struct State
744     {
745         Thread!DataIndex* t;
746         ThreadList!DataIndex worklist;
747         Group!DataIndex[] matches;
748 
popStateThompsonMatcher::State749         bool popState(E)(E* e)
750         {
751             with(e)
752             {
753                 recycle(t);
754                 t = worklist.fetch();
755                 return t != null;
756             }
757         }
758 
759     }
760 
761     static if (__traits(hasMember,Stream, "search"))
762     {
763         enum kicked = true;
764     }
765     else
766         enum kicked = false;
767 
768     static size_t getThreadSize(const ref Regex!Char re)
769     {
770         return re.ngroup
771             ? (Thread!DataIndex).sizeof + (re.ngroup-1)*(Group!DataIndex).sizeof
772             : (Thread!DataIndex).sizeof - (Group!DataIndex).sizeof;
773     }
774 
775     static size_t initialMemory(const ref Regex!Char re)
776     {
777         return getThreadSize(re)*re.threadCount + re.hotspotTableSize*size_t.sizeof
778             +4*OpFunc.sizeof*re.ir.length;
779     }
780 
781     //true if it's start of input
atStartThompsonMatcher782     @property bool atStart(){   return index == 0; }
783 
784     //true if it's end of input
atEndThompsonMatcher785     @property bool atEnd(){  return index == s.lastIndex && s.atEnd; }
786 
nextThompsonMatcher787     bool next()
788     {
789         if (!s.nextChar(front, index))
790         {
791             index =  s.lastIndex;
792             return false;
793         }
794         return true;
795     }
796 
797     static if (kicked)
798     {
searchThompsonMatcher799         bool search()
800         {
801 
802             if (!s.search(re.kickstart, front, index))
803             {
804                 index = s.lastIndex;
805                 return false;
806             }
807             return true;
808         }
809     }
810 
initExternalMemoryThompsonMatcher811     void initExternalMemory(void[] memory)
812     {
813         threadSize = getThreadSize(re);
814         prepareFreeList(re.threadCount, memory);
815         if (re.hotspotTableSize)
816         {
817             merge = arrayInChunk!(DataIndex)(re.hotspotTableSize, memory);
818             merge[] = 0;
819         }
820         opCacheTrue = arrayInChunk!(OpFunc)(re.ir.length, memory);
821         opCacheFalse = arrayInChunk!(OpFunc)(re.ir.length, memory);
822         opCacheBackTrue = arrayInChunk!(OpBackFunc)(re.ir.length, memory);
823         opCacheBackFalse = arrayInChunk!(OpBackFunc)(re.ir.length, memory);
824 
825         for (uint pc = 0; pc<re.ir.length; pc += re.ir[pc].length)
826         {
827         L_dispatch:
828             switch (re.ir[pc].code)
829             {
830                 foreach (e; __traits(allMembers, IR))
831                 {
832             mixin(`case IR.`~e~`:
833                     opCacheTrue[pc] = &Ops!(true).op!(IR.`~e~`);
834                     opCacheBackTrue[pc] = &BackOps!(true).op!(IR.`~e~`);
835                     opCacheFalse[pc] = &Ops!(false).op!(IR.`~e~`);
836                     opCacheBackFalse[pc] = &BackOps!(false).op!(IR.`~e~`);
837                 break L_dispatch;
838                 `);
839                 }
840             default:
841                 assert(0, "Unrecognized instruction "~re.ir[pc].mnemonic);
842             }
843         }
844     }
845 
thisThompsonMatcher846     this()(Regex!Char program, Stream stream, void[] memory)
847     {
848         re = program;
849         s = stream;
850         initExternalMemory(memory);
851         genCounter = 0;
852     }
853 
thisThompsonMatcher854     this(ref ThompsonMatcher matcher, size_t lo, size_t hi, Stream stream)
855     {
856         s = stream;
857         re = matcher.re;
858         re.ir = re.ir[lo .. hi];
859         threadSize = matcher.threadSize;
860         merge = matcher.merge;
861         freelist = matcher.freelist;
862         opCacheTrue = matcher.opCacheTrue[lo .. hi];
863         opCacheBackTrue = matcher.opCacheBackTrue[lo .. hi];
864         opCacheFalse = matcher.opCacheFalse[lo .. hi];
865         opCacheBackFalse = matcher.opCacheBackFalse[lo .. hi];
866         front = matcher.front;
867         index = matcher.index;
868     }
869 
thisThompsonMatcher870     this(ref BackMatcher matcher, size_t lo, size_t hi, Stream stream)
871     {
872         s = stream;
873         re = matcher.re;
874         re.ir = re.ir[lo .. hi];
875         threadSize = matcher.threadSize;
876         merge = matcher.merge;
877         freelist = matcher.freelist;
878         opCacheTrue = matcher.opCacheBackTrue[lo .. hi];
879         opCacheBackTrue = matcher.opCacheTrue[lo .. hi];
880         opCacheFalse = matcher.opCacheBackFalse[lo .. hi];
881         opCacheBackFalse = matcher.opCacheFalse[lo .. hi];
882         front = matcher.front;
883         index = matcher.index;
884     }
885 
fwdMatcherThompsonMatcher886     auto fwdMatcher()(size_t lo, size_t hi, size_t counter)
887     {
888         auto m = ThompsonMatcher!(Char, Stream)(this, lo, hi, s);
889         m.genCounter = counter;
890         return m;
891     }
892 
bwdMatcherThompsonMatcher893     auto bwdMatcher()(size_t lo, size_t hi, size_t counter)
894     {
895         alias BackLooper = typeof(s.loopBack(index));
896         auto m = ThompsonMatcher!(Char, BackLooper)(this, lo, hi, s.loopBack(index));
897         m.genCounter = counter;
898         m.next();
899         return m;
900     }
901 
dupToThompsonMatcher902     auto dupTo(void[] memory)
903     {
904         typeof(this) tmp = this;//bitblit
905         tmp.initExternalMemory(memory);
906         tmp.genCounter = 0;
907         return tmp;
908     }
909 
910     int match(Group!DataIndex[] matches)
911     {
912         debug(std_regex_matcher)
913             writeln("------------------------------------------");
914         if (exhausted)
915         {
916             return false;
917         }
918         if (re.flags & RegexInfo.oneShot)
919         {
920             next();
921             exhausted = true;
922             return matchOneShot(matches);
923         }
924         static if (kicked)
925             if (!re.kickstart.empty)
926                 return matchImpl!(true)(matches);
927         return matchImpl!(false)(matches);
928     }
929 
930     //match the input and fill matches
matchImplThompsonMatcher931     int matchImpl(bool withSearch)(Group!DataIndex[] matches)
932     {
933         if (!matched && clist.empty)
934         {
935            static if (withSearch)
936                 search();
937            else
938                 next();
939         }
940         else//char in question is  fetched in prev call to match
941         {
942             matched = 0;
943         }
944         State state;
945         state.matches = matches;
946 
947         if (!atEnd)//if no char
948             for (;;)
949             {
950                 genCounter++;
951                 debug(std_regex_matcher)
952                 {
953                     writefln("Threaded matching threads at  %s", s[index .. s.lastIndex]);
954                     foreach (t; clist[])
955                     {
956                         assert(t);
957                         writef("pc=%s ",t.pc);
958                         write(t.matches);
959                         writeln();
960                     }
961                 }
962                 for (state.t = clist.fetch(); state.t; state.t = clist.fetch())
963                 {
964                     eval!true(&state);
965                 }
966                 //if we already have match no need to push the engine
967                 if (!matched)
968                 {
969                     state.t = createStart(index);
970                     eval!true(&state);//new thread staring at this position
971                 }
972                 else if (nlist.empty)
973                 {
974                     debug(std_regex_matcher) writeln("Stopped  matching before consuming full input");
975                     break;//not a partial match for sure
976                 }
977                 clist = nlist;
978                 nlist = (ThreadList!DataIndex).init;
979                 if (clist.tip is null)
980                 {
981                     static if (withSearch)
982                     {
983                         if (!search())
984                             break;
985                     }
986                     else
987                     {
988                         if (!next())
989                             break;
990                     }
991                 }
992                 else if (!next())
993                 {
994                     if (!atEnd) return false;
995                     exhausted = true;
996                     break;
997                 }
998             }
999 
1000         genCounter++; //increment also on each end
1001         debug(std_regex_matcher) writefln("Threaded matching threads at end");
1002         //try out all zero-width posibilities
1003         for (state.t = clist.fetch(); state.t; state.t = clist.fetch())
1004         {
1005             eval!false(&state);
1006         }
1007         if (!matched)
1008         {
1009             state.t = createStart(index);
1010             eval!false(&state);//new thread starting at end of input
1011         }
1012         if (matched)
1013         {//in case NFA found match along the way
1014          //and last possible longer alternative ultimately failed
1015             s.reset(matches[0].end);//reset to last successful match
1016             next();//and reload front character
1017             //--- here the exact state of stream was restored ---
1018             exhausted = atEnd || !(re.flags & RegexOption.global);
1019             //+ empty match advances the input
1020             if (!exhausted && matches[0].begin == matches[0].end)
1021                 next();
1022         }
1023         return matched;
1024     }
1025 
1026     /+
1027         handle succesful threads
1028     +/
1029     void finish(const(Thread!DataIndex)* t, Group!DataIndex[] matches, int code)
1030     {
1031         matches.ptr[0 .. re.ngroup] = t.matches.ptr[0 .. re.ngroup];
debugThompsonMatcher1032         debug(std_regex_matcher)
1033         {
1034             writef("FOUND pc=%s prog_len=%s",
1035                     t.pc, re.ir.length);
1036             if (!matches.empty)
1037                 writefln(": %s..%s", matches[0].begin, matches[0].end);
1038             foreach (v; matches)
1039                 writefln("%d .. %d", v.begin, v.end);
1040         }
1041         matched = code;
1042     }
1043 
1044     alias Ops(bool withInput) =  ThompsonOps!(ThompsonMatcher, State, withInput);
1045     alias BackOps(bool withInput) =  ThompsonOps!(BackMatcher, BackMatcher.State, withInput);
1046 
1047     /+
1048         match thread against codepoint, cutting trough all 0-width instructions
1049         and taking care of control flow, then add it to nlist
1050     +/
evalThompsonMatcher1051     void eval(bool withInput)(State* state)
1052     {
1053         debug(std_regex_matcher) writeln("---- Evaluating thread");
1054         static if (withInput)
1055             while (opCacheTrue.ptr[state.t.pc](&this, state)){}
1056         else
1057             while (opCacheFalse.ptr[state.t.pc](&this, state)){}
1058     }
1059     enum uint RestartPc = uint.max;
1060     //match the input, evaluating IR without searching
1061     int matchOneShot(Group!DataIndex[] matches, uint startPc = 0)
1062     {
debugThompsonMatcher1063         debug(std_regex_matcher)
1064         {
1065             writefln("---------------single shot match ----------------- ");
1066         }
1067         alias evalFn = eval;
1068         assert(clist == (ThreadList!DataIndex).init || startPc == RestartPc); // incorrect after a partial match
1069         assert(nlist == (ThreadList!DataIndex).init || startPc == RestartPc);
1070         State state;
1071         state.matches = matches;
1072         if (!atEnd)//if no char
1073         {
debugThompsonMatcher1074             debug(std_regex_matcher)
1075             {
1076                 writefln("-- Threaded matching threads at  %s",  s[index .. s.lastIndex]);
1077             }
1078             if (startPc != RestartPc)
1079             {
1080                 state.t = createStart(index, startPc);
1081                 genCounter++;
1082                 evalFn!true(&state);
1083             }
1084             for (;;)
1085             {
1086                 debug(std_regex_matcher) writeln("\n-- Started iteration of main cycle");
1087                 genCounter++;
debugThompsonMatcher1088                 debug(std_regex_matcher)
1089                 {
1090                     foreach (t; clist[])
1091                     {
1092                         assert(t);
1093                     }
1094                 }
1095                 for (state.t = clist.fetch(); state.t; state.t = clist.fetch())
1096                 {
1097                     evalFn!true(&state);
1098                 }
1099                 if (nlist.empty)
1100                 {
1101                     debug(std_regex_matcher) writeln("Stopped  matching before consuming full input");
1102                     break;//not a partial match for sure
1103                 }
1104                 clist = nlist;
1105                 nlist = (ThreadList!DataIndex).init;
1106                 if (!next())
1107                     break;
1108                 debug(std_regex_matcher) writeln("-- Ended iteration of main cycle\n");
1109             }
1110         }
1111         genCounter++; //increment also on each end
1112         debug(std_regex_matcher) writefln("-- Matching threads at end");
1113         //try out all zero-width posibilities
1114         for (state.t = clist.fetch(); state.t; state.t = clist.fetch())
1115         {
1116             evalFn!false(&state);
1117         }
1118         if (!matched)
1119         {
1120             state.t = createStart(index, startPc);
1121             evalFn!false(&state);
1122         }
1123         return matched;
1124     }
1125 
1126     //get a dirty recycled Thread
1127     Thread!DataIndex* allocate()
1128     {
1129         assert(freelist, "not enough preallocated memory");
1130         Thread!DataIndex* t = freelist;
1131         freelist = freelist.next;
1132         return t;
1133     }
1134 
1135     //link memory into a free list of Threads
prepareFreeListThompsonMatcher1136     void prepareFreeList(size_t size, ref void[] memory)
1137     {
1138         void[] mem = memory[0 .. threadSize*size];
1139         memory = memory[threadSize * size .. $];
1140         freelist = cast(Thread!DataIndex*)&mem[0];
1141         size_t i;
1142         for (i = threadSize; i < threadSize*size; i += threadSize)
1143             (cast(Thread!DataIndex*)&mem[i-threadSize]).next = cast(Thread!DataIndex*)&mem[i];
1144         (cast(Thread!DataIndex*)&mem[i-threadSize]).next = null;
1145     }
1146 
1147     //dispose a thread
1148     void recycle(Thread!DataIndex* t)
1149     {
1150         t.next = freelist;
1151         freelist = t;
1152     }
1153 
1154     //dispose list of threads
1155     void recycle(ref ThreadList!DataIndex list)
1156     {
1157         if (list.tip)
1158         {
1159             // just put this head-tail list in front of freelist
1160             list.toe.next = freelist;
1161             freelist = list.tip;
1162             list = list.init;
1163         }
1164     }
1165 
1166     //creates a copy of master thread with given pc
1167     Thread!DataIndex* fork(Thread!DataIndex* master, uint pc, uint counter)
1168     {
1169         auto t = allocate();
1170         t.matches.ptr[0 .. re.ngroup] = master.matches.ptr[0 .. re.ngroup];
1171         t.pc = pc;
1172         t.counter = counter;
1173         t.uopCounter = 0;
1174         return t;
1175     }
1176 
1177     //creates a start thread
1178     Thread!DataIndex* createStart(DataIndex index, uint pc = 0)
1179     {
1180         auto t = allocate();
1181         t.matches.ptr[0 .. re.ngroup] = (Group!DataIndex).init;
1182         t.matches[0].begin = index;
1183         t.pc = pc;
1184         t.counter = 0;
1185         t.uopCounter = 0;
1186         return t;
1187     }
1188 }
1189