xref: /netbsd-src/external/gpl3/gcc/dist/libphobos/src/std/regex/internal/thompson.d (revision b1e838363e3c6fc78a55519254d99869742dd33c)
1 //Written in the D programming language
2 /*
3     Implementation of Thompson NFA std.regex engine.
4     Key point is evaluation of all possible threads (state) at each step
5     in a breadth-first manner, thereby geting some nice properties:
6         - looking at each character only once
7         - merging of equivalent threads, that gives matching process linear time complexity
8 */
9 module std.regex.internal.thompson;
10 
11 package(std.regex):
12 
13 import std.range.primitives;
14 import std.regex.internal.ir;
15 
16 //State of VM thread
Thread(DataIndex)17 struct Thread(DataIndex)
18 {
19     Thread* next;    //intrusive linked list
20     uint pc;
21     uint counter;    //loop counter
22     uint uopCounter; //counts micro operations inside one macro instruction (e.g. BackRef)
23     Group!DataIndex[1] matches;
24 }
25 
26 //head-tail singly-linked list
ThreadList(DataIndex)27 struct ThreadList(DataIndex)
28 {
29     Thread!DataIndex* tip = null, toe = null;
30     //add new thread to the start of list
31     void insertFront(Thread!DataIndex* t)
32     {
33         if (tip)
34         {
35             t.next = tip;
36             tip = t;
37         }
38         else
39         {
40             t.next = null;
41             tip = toe = t;
42         }
43     }
44     //add new thread to the end of list
45     void insertBack(Thread!DataIndex* t)
46     {
47         if (toe)
48         {
49             toe.next = t;
50             toe = t;
51         }
52         else
53             tip = toe = t;
54         toe.next = null;
55     }
56     //move head element out of list
57     Thread!DataIndex* fetch()
58     {
59         auto t = tip;
60         if (tip == toe)
61             tip = toe = null;
62         else
63             tip = tip.next;
64         return t;
65     }
66     //non-destructive iteration of ThreadList
67     struct ThreadRange
68     {
69         const(Thread!DataIndex)* ct;
70         this(ThreadList tlist){ ct = tlist.tip; }
71         @property bool empty(){ return ct is null; }
72         @property const(Thread!DataIndex)* front(){ return ct; }
73         void popFront()
74         {
75             assert(ct);
76             ct = ct.next;
77         }
78     }
79     @property bool empty()
80     {
81         return tip == null;
82     }
83     ThreadRange opSlice()
84     {
85         return ThreadRange(this);
86     }
87 }
88 
89 template ThompsonOps(E, S, bool withInput:true)
90 {
91 @trusted:
92     static bool op(IR code:IR.End)(E e, S* state)
93     {
with(state)94         with(e) with(state)
95         {
96             finish(t, matches, re.ir[t.pc].data);
97             //fix endpoint of the whole match
98             matches[0].end = index;
99             recycle(t);
100             //cut off low priority threads
101             recycle(clist);
102             recycle(worklist);
103             debug(std_regex_matcher) writeln("Finished thread ", matches);
104             return false; // no more state to eval
105         }
106     }
107 
108     static bool op(IR code:IR.Wordboundary)(E e, S* state)
109     {
with(state)110         with(e) with(state)
111         {
112             dchar back;
113             DataIndex bi;
114             //at start & end of input
115             if (atStart && wordMatcher[front])
116             {
117                 t.pc += IRL!(IR.Wordboundary);
118                 return true;
119             }
120             else if (atEnd && s.loopBack(index).nextChar(back, bi)
121                     && wordMatcher[back])
122             {
123                 t.pc += IRL!(IR.Wordboundary);
124                 return true;
125             }
126             else if (s.loopBack(index).nextChar(back, bi))
127             {
128                 bool af = wordMatcher[front];
129                 bool ab = wordMatcher[back];
130                 if (af ^ ab)
131                 {
132                     t.pc += IRL!(IR.Wordboundary);
133                     return true;
134                 }
135             }
136             return popState(e);
137         }
138     }
139 
140     static bool op(IR code:IR.Notwordboundary)(E e, S* state)
141     {
with(state)142         with(e) with(state)
143         {
144             dchar back;
145             DataIndex bi;
146             //at start & end of input
147             if (atStart && wordMatcher[front])
148             {
149                 return popState(e);
150             }
151             else if (atEnd && s.loopBack(index).nextChar(back, bi)
152                     && wordMatcher[back])
153             {
154                 return popState(e);
155             }
156             else if (s.loopBack(index).nextChar(back, bi))
157             {
158                 bool af = wordMatcher[front];
159                 bool ab = wordMatcher[back]  != 0;
160                 if (af ^ ab)
161                 {
162                     return popState(e);
163                 }
164             }
165             t.pc += IRL!(IR.Notwordboundary);
166         }
167         return true;
168     }
169 
170     static bool op(IR code:IR.Bof)(E e, S* state)
171     {
with(state)172         with(e) with(state)
173         {
174             if (atStart)
175             {
176                 t.pc += IRL!(IR.Bof);
177                 return true;
178             }
179             else
180             {
181                 return popState(e);
182             }
183         }
184     }
185 
186     static bool op(IR code:IR.Bol)(E e, S* state)
187     {
with(state)188         with(e) with(state)
189         {
190             dchar back;
191             DataIndex bi;
192             if (atStart
193                 ||(s.loopBack(index).nextChar(back,bi)
194                 && startOfLine(back, front == '\n')))
195             {
196                 t.pc += IRL!(IR.Bol);
197                 return true;
198             }
199             else
200             {
201                 return popState(e);
202             }
203         }
204     }
205 
206     static bool op(IR code:IR.Eof)(E e, S* state)
207     {
with(state)208         with(e) with(state)
209         {
210             if (atEnd)
211             {
212                 t.pc += IRL!(IR.Eol);
213                 return true;
214             }
215             else
216             {
217                 return popState(e);
218             }
219         }
220     }
221 
222     static bool op(IR code:IR.Eol)(E e, S* state)
223     {
with(state)224         with(e) with(state)
225         {
226             dchar back;
227             DataIndex bi;
228             //no matching inside \r\n
229             if (atEnd || (endOfLine(front, s.loopBack(index).nextChar(back, bi)
230                     && back == '\r')))
231             {
232                 t.pc += IRL!(IR.Eol);
233                 return true;
234             }
235             else
236             {
237                 return popState(e);
238             }
239 
240         }
241     }
242 
243     static bool op(IR code:IR.InfiniteStart)(E e, S* state)
244     {
245         with(e) with(state)
246             t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteStart);
247         return op!(IR.InfiniteEnd)(e,state);
248     }
249 
250     static bool op(IR code:IR.InfiniteBloomStart)(E e, S* state)
251     {
252         with(e) with(state)
253             t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteBloomStart);
254         return op!(IR.InfiniteBloomEnd)(e,state);
255     }
256 
257     static bool op(IR code:IR.InfiniteQStart)(E e, S* state)
258     {
259         with(e) with(state)
260             t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteQStart);
261         return op!(IR.InfiniteQEnd)(e,state);
262     }
263 
264     static bool op(IR code:IR.RepeatStart)(E e, S* state)
265     {
266         with(e) with(state)
267             t.pc += re.ir[t.pc].data + IRL!(IR.RepeatStart);
268         return op!(IR.RepeatEnd)(e,state);
269     }
270 
271     static bool op(IR code:IR.RepeatQStart)(E e, S* state)
272     {
273         with(e) with(state)
274             t.pc += re.ir[t.pc].data + IRL!(IR.RepeatQStart);
275         return op!(IR.RepeatQEnd)(e,state);
276     }
277 
278     static bool op(IR code)(E e, S* state)
279         if (code == IR.RepeatEnd || code == IR.RepeatQEnd)
280     {
with(state)281         with(e) with(state)
282         {
283             //len, step, min, max
284                 uint len = re.ir[t.pc].data;
285                 uint step =  re.ir[t.pc+2].raw;
286                 uint min = re.ir[t.pc+3].raw;
287                 if (t.counter < min)
288                 {
289                     t.counter += step;
290                     t.pc -= len;
291                     return true;
292                 }
293                 if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter)
294                 {
295                     debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s",
296                                     t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] );
297                     merge[re.ir[t.pc + 1].raw+t.counter] = genCounter;
298                 }
299                 else
300                 {
301                     debug(std_regex_matcher)
302                         writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s",
303                             t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] );
304                     return popState(e);
305                 }
306                 uint max = re.ir[t.pc+4].raw;
307                 if (t.counter < max)
308                 {
309                     if (re.ir[t.pc].code == IR.RepeatEnd)
310                     {
311                         //queue out-of-loop thread
312                         worklist.insertFront(fork(t, t.pc + IRL!(IR.RepeatEnd),  t.counter % step));
313                         t.counter += step;
314                         t.pc -= len;
315                     }
316                     else
317                     {
318                         //queue into-loop thread
319                         worklist.insertFront(fork(t, t.pc - len,  t.counter + step));
320                         t.counter %= step;
321                         t.pc += IRL!(IR.RepeatEnd);
322                     }
323                 }
324                 else
325                 {
326                     t.counter %= step;
327                     t.pc += IRL!(IR.RepeatEnd);
328                 }
329                 return true;
330         }
331     }
332 
333     static bool op(IR code)(E e, S* state)
334         if (code == IR.InfiniteEnd || code == IR.InfiniteQEnd)
335     {
with(state)336         with(e) with(state)
337         {
338             if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter)
339             {
340                 debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s",
341                                 t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] );
342                 merge[re.ir[t.pc + 1].raw+t.counter] = genCounter;
343             }
344             else
345             {
346                 debug(std_regex_matcher) writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s",
347                                 t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] );
348                 return popState(e);
349             }
350             uint len = re.ir[t.pc].data;
351             uint pc1, pc2; //branches to take in priority order
352             if (re.ir[t.pc].code == IR.InfiniteEnd)
353             {
354                 pc1 = t.pc - len;
355                 pc2 = t.pc + IRL!(IR.InfiniteEnd);
356             }
357             else
358             {
359                 pc1 = t.pc + IRL!(IR.InfiniteEnd);
360                 pc2 = t.pc - len;
361             }
362             worklist.insertFront(fork(t, pc2, t.counter));
363             t.pc = pc1;
364             return true;
365         }
366     }
367 
368     static bool op(IR code)(E e, S* state)
369         if (code == IR.InfiniteBloomEnd)
370     {
with(state)371         with(e) with(state)
372         {
373             if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter)
374             {
375                 debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s",
376                                 t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] );
377                 merge[re.ir[t.pc + 1].raw+t.counter] = genCounter;
378             }
379             else
380             {
381                 debug(std_regex_matcher) writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s",
382                                 t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] );
383                 return popState(e);
384             }
385             uint len = re.ir[t.pc].data;
386             uint pc1, pc2; //branches to take in priority order
387             pc1 = t.pc - len;
388             pc2 = t.pc + IRL!(IR.InfiniteBloomEnd);
389             uint filterIndex = re.ir[t.pc + 2].raw;
390             if (re.filters[filterIndex][front])
391                 worklist.insertFront(fork(t, pc2, t.counter));
392             t.pc = pc1;
393             return true;
394         }
395     }
396 
397     static bool op(IR code:IR.OrEnd)(E e, S* state)
398     {
with(state)399         with(e) with(state)
400         {
401             if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter)
402             {
403                 debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s",
404                                 t.pc, s[index .. s.lastIndex], genCounter, merge[re.ir[t.pc + 1].raw + t.counter] );
405                 merge[re.ir[t.pc + 1].raw+t.counter] = genCounter;
406                 t.pc += IRL!(IR.OrEnd);
407             }
408             else
409             {
410                 debug(std_regex_matcher) writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s",
411                                 t.pc, s[index .. s.lastIndex], genCounter, merge[re.ir[t.pc + 1].raw + t.counter] );
412                 return popState(e);
413             }
414             return true;
415         }
416     }
417 
418     static bool op(IR code:IR.OrStart)(E e, S* state)
419     {
with(state)420         with(e) with(state)
421         {
422             t.pc += IRL!(IR.OrStart);
423             return op!(IR.Option)(e,state);
424         }
425     }
426 
427     static bool op(IR code:IR.Option)(E e, S* state)
428     {
with(state)429         with(e) with(state)
430         {
431             uint next = t.pc + re.ir[t.pc].data + IRL!(IR.Option);
432             //queue next Option
433             if (re.ir[next].code == IR.Option)
434             {
435                 worklist.insertFront(fork(t, next, t.counter));
436             }
437             t.pc += IRL!(IR.Option);
438             return true;
439         }
440     }
441 
442     static bool op(IR code:IR.GotoEndOr)(E e, S* state)
443     {
with(state)444         with(e) with(state)
445         {
446             t.pc = t.pc + re.ir[t.pc].data + IRL!(IR.GotoEndOr);
447             return op!(IR.OrEnd)(e, state);
448         }
449     }
450 
451     static bool op(IR code:IR.GroupStart)(E e, S* state)
452     {
with(state)453         with(e) with(state)
454         {
455             uint n = re.ir[t.pc].data;
456             t.matches.ptr[n].begin = index;
457             t.pc += IRL!(IR.GroupStart);
458             return true;
459         }
460     }
461     static bool op(IR code:IR.GroupEnd)(E e, S* state)
462     {
with(state)463         with(e) with(state)
464         {
465             uint n = re.ir[t.pc].data;
466             t.matches.ptr[n].end = index;
467             t.pc += IRL!(IR.GroupEnd);
468             return true;
469         }
470     }
471 
472     static bool op(IR code:IR.Backref)(E e, S* state)
473     {
with(state)474         with(e) with(state)
475         {
476             uint n = re.ir[t.pc].data;
477             Group!DataIndex* source = re.ir[t.pc].localRef ? t.matches.ptr : backrefed.ptr;
478             assert(source);
479             if (source[n].begin == source[n].end)//zero-width Backref!
480             {
481                 t.pc += IRL!(IR.Backref);
482                 return true;
483             }
484             else
485             {
486                 size_t idx = source[n].begin + t.uopCounter;
487                 size_t end = source[n].end;
488                 if (s[idx .. end].front == front)
489                 {
490                     import std.utf : stride;
491 
492                     t.uopCounter += stride(s[idx .. end], 0);
493                     if (t.uopCounter + source[n].begin == source[n].end)
494                     {//last codepoint
495                         t.pc += IRL!(IR.Backref);
496                         t.uopCounter = 0;
497                     }
498                     nlist.insertBack(t);
499                 }
500                 else
501                     recycle(t);
502                 t = worklist.fetch();
503                 return t != null;
504             }
505         }
506     }
507 
508 
509     static bool op(IR code)(E e, S* state)
510         if (code == IR.LookbehindStart || code == IR.NeglookbehindStart)
511     {
with(state)512         with(e) with(state)
513         {
514             uint len = re.ir[t.pc].data;
515             uint ms = re.ir[t.pc + 1].raw, me = re.ir[t.pc + 2].raw;
516             uint end = t.pc + len + IRL!(IR.LookbehindEnd) + IRL!(IR.LookbehindStart);
517             bool positive = re.ir[t.pc].code == IR.LookbehindStart;
518             static if (Stream.isLoopback)
519                 auto matcher = fwdMatcher(t.pc, end, me - ms, subCounters.get(t.pc, 0));
520             else
521                 auto matcher = bwdMatcher(t.pc, end, me - ms, subCounters.get(t.pc, 0));
522             matcher.backrefed = backrefed.empty ? t.matches : backrefed;
523             //backMatch
524             auto mRes = matcher.matchOneShot(t.matches.ptr[ms .. me], IRL!(IR.LookbehindStart));
525             freelist = matcher.freelist;
526             subCounters[t.pc] = matcher.genCounter;
527             if ((mRes != 0 ) ^ positive)
528             {
529                 return popState(e);
530             }
531             t.pc = end;
532             return true;
533         }
534     }
535 
536     static bool op(IR code)(E e, S* state)
537         if (code == IR.LookaheadStart || code == IR.NeglookaheadStart)
538     {
with(state)539         with(e) with(state)
540         {
541             auto save = index;
542             uint len = re.ir[t.pc].data;
543             uint ms = re.ir[t.pc+1].raw, me = re.ir[t.pc+2].raw;
544             uint end = t.pc+len+IRL!(IR.LookaheadEnd)+IRL!(IR.LookaheadStart);
545             bool positive = re.ir[t.pc].code == IR.LookaheadStart;
546             static if (Stream.isLoopback)
547                 auto matcher = bwdMatcher(t.pc, end, me - ms, subCounters.get(t.pc, 0));
548             else
549                 auto matcher = fwdMatcher(t.pc, end, me - ms, subCounters.get(t.pc, 0));
550             matcher.backrefed = backrefed.empty ? t.matches : backrefed;
551             auto mRes = matcher.matchOneShot(t.matches.ptr[ms .. me], IRL!(IR.LookaheadStart));
552             freelist = matcher.freelist;
553             subCounters[t.pc] = matcher.genCounter;
554             s.reset(index);
555             next();
556             if ((mRes != 0) ^ positive)
557             {
558                 return popState(e);
559             }
560             t.pc = end;
561             return true;
562         }
563     }
564 
565     static bool op(IR code)(E e, S* state)
566         if (code == IR.LookaheadEnd || code == IR.NeglookaheadEnd ||
567             code == IR.LookbehindEnd || code == IR.NeglookbehindEnd)
568     {
with(state)569         with(e) with(state)
570         {
571                 finish(t, matches.ptr[0 .. re.ngroup], re.ir[t.pc].data);
572                 recycle(t);
573                 //cut off low priority threads
574                 recycle(clist);
575                 recycle(worklist);
576                 return false; // no more state
577         }
578     }
579 
580     static bool op(IR code:IR.Nop)(E e, S* state)
581     {
582         with(state) t.pc += IRL!(IR.Nop);
583         return true;
584     }
585 
586     static bool op(IR code:IR.OrChar)(E e, S* state)
587     {
with(state)588         with(e) with(state)
589         {
590             uint len = re.ir[t.pc].sequence;
591             uint end = t.pc + len;
592             static assert(IRL!(IR.OrChar) == 1);
593             for (; t.pc < end; t.pc++)
594                 if (re.ir[t.pc].data == front)
595                     break;
596             if (t.pc != end)
597             {
598                 t.pc = end;
599                 nlist.insertBack(t);
600             }
601             else
602                 recycle(t);
603             t = worklist.fetch();
604             return t != null;
605         }
606     }
607 
608     static bool op(IR code:IR.Char)(E e, S* state)
609     {
with(state)610         with(e) with(state)
611         {
612             if (front == re.ir[t.pc].data)
613             {
614                 t.pc += IRL!(IR.Char);
615                 nlist.insertBack(t);
616             }
617             else
618                 recycle(t);
619             t = worklist.fetch();
620             return t != null;
621         }
622     }
623 
624     static bool op(IR code:IR.Any)(E e, S* state)
625     {
with(state)626         with(e) with(state)
627         {
628             t.pc += IRL!(IR.Any);
629             nlist.insertBack(t);
630             t = worklist.fetch();
631             return t != null;
632         }
633     }
634 
635     static bool op(IR code:IR.CodepointSet)(E e, S* state)
636     {
with(state)637         with(e) with(state)
638         {
639             if (re.charsets[re.ir[t.pc].data].scanFor(front))
640             {
641                 t.pc += IRL!(IR.CodepointSet);
642                 nlist.insertBack(t);
643             }
644             else
645             {
646                 recycle(t);
647             }
648             t = worklist.fetch();
649             return t != null;
650         }
651     }
652 
653     static bool op(IR code:IR.Trie)(E e, S* state)
654     {
with(state)655         with(e) with(state)
656         {
657             if (re.matchers[re.ir[t.pc].data][front])
658             {
659                 t.pc += IRL!(IR.Trie);
660                 nlist.insertBack(t);
661             }
662             else
663             {
664                 recycle(t);
665             }
666             t = worklist.fetch();
667             return t != null;
668         }
669     }
670 
671 }
672 
673 template ThompsonOps(E,S, bool withInput:false)
674 {
675 @trusted:
676     // can't match these without input
677     static bool op(IR code)(E e, S* state)
678         if (code == IR.Char || code == IR.OrChar || code == IR.CodepointSet
679         || code == IR.Trie || code == IR.Char || code == IR.Any)
680     {
681         return state.popState(e);
682     }
683 
684     // special case of zero-width backref
685     static bool op(IR code:IR.Backref)(E e, S* state)
686     {
with(state)687         with(e) with(state)
688         {
689             uint n = re.ir[t.pc].data;
690             Group!DataIndex* source = re.ir[t.pc].localRef ? t.matches.ptr : backrefed.ptr;
691             assert(source);
692             if (source[n].begin == source[n].end)//zero-width Backref!
693             {
694                 t.pc += IRL!(IR.Backref);
695                 return true;
696             }
697             else
698                 return popState(e);
699         }
700     }
701 
702     // forward all control flow to normal versions
703     static bool op(IR code)(E e, S* state)
704         if (code != IR.Char && code != IR.OrChar && code != IR.CodepointSet
705         && code != IR.Trie && code != IR.Char && code != IR.Any && code != IR.Backref)
706     {
707         return ThompsonOps!(E,S,true).op!code(e,state);
708     }
709 }
710 
711 /+
712    Thomspon matcher does all matching in lockstep,
713    never looking at the same char twice
714 +/
715 @trusted class ThompsonMatcher(Char, StreamType = Input!Char): Matcher!Char
716 if (is(Char : dchar))
717 {
718     alias DataIndex = Stream.DataIndex;
719     alias Stream = StreamType;
720     alias OpFunc = bool function(ThompsonMatcher, State*) pure;
721     alias BackMatcher = ThompsonMatcher!(Char, BackLooper!(Stream));
722     alias OpBackFunc = bool function(BackMatcher, BackMatcher.State*) pure;
723     Thread!DataIndex* freelist;
724     ThreadList!DataIndex clist, nlist;
725     DataIndex[] merge;
726     Group!DataIndex[] backrefed;
727     const Regex!Char re;           //regex program
728     Stream s;
729     dchar front;
730     DataIndex index;
731     DataIndex genCounter;    //merge trace counter, goes up on every dchar
732     size_t[size_t] subCounters; //a table of gen counter per sub-engine: PC -> counter
733     OpFunc[] opCacheTrue;   // pointers to Op!(IR.xyz) for each bytecode
734     OpFunc[] opCacheFalse;  // ditto
735     OpBackFunc[] opCacheBackTrue;   // ditto
736     OpBackFunc[] opCacheBackFalse;  // ditto
737     size_t threadSize;
738     size_t _refCount;
739     int matched;
740     bool exhausted;
741 
742 final:
743     pure
744     static struct State
745     {
746         Thread!DataIndex* t;
747         ThreadList!DataIndex worklist;
748         Group!DataIndex[] matches;
749 
popStateState750         bool popState(E)(E e)
751         {
752             with(e)
753             {
754                 recycle(t);
755                 t = worklist.fetch();
756                 return t != null;
757             }
758         }
759 
760     }
761 
762     static if (__traits(hasMember,Stream, "search"))
763     {
764         enum kicked = true;
765     }
766     else
767         enum kicked = false;
768 
769     static size_t getThreadSize(const ref Regex!Char re)
770     {
771         return re.ngroup
772             ? (Thread!DataIndex).sizeof + (re.ngroup-1)*(Group!DataIndex).sizeof
773             : (Thread!DataIndex).sizeof - (Group!DataIndex).sizeof;
774     }
775 
776     static size_t initialMemory(const ref Regex!Char re)
777     {
778         return getThreadSize(re)*re.threadCount + re.hotspotTableSize*size_t.sizeof
779             +4*OpFunc.sizeof*re.ir.length;
780     }
781 
782     //true if it's start of input
atStart()783     @property bool atStart(){   return index == 0; }
784 
785     //true if it's end of input
atEnd()786     @property bool atEnd(){  return index == s.lastIndex && s.atEnd; }
787 
refCount()788     override @property ref size_t refCount() @safe { return _refCount; }
789 
pattern()790     override @property ref const(Regex!Char) pattern() @safe { return re; }
791 
next()792     bool next()
793     {
794         if (!s.nextChar(front, index))
795         {
796             index =  s.lastIndex;
797             return false;
798         }
799         return true;
800     }
801 
802     static if (kicked)
803     {
search()804         bool search()
805         {
806 
807             if (!s.search(re.kickstart, front, index))
808             {
809                 index = s.lastIndex;
810                 return false;
811             }
812             return true;
813         }
814     }
815 
initExternalMemory(void[]memory)816     void initExternalMemory(void[] memory)
817     {
818         threadSize = getThreadSize(re);
819         prepareFreeList(re.threadCount, memory);
820         if (re.hotspotTableSize)
821         {
822             merge = arrayInChunk!(DataIndex)(re.hotspotTableSize, memory);
823             merge[] = 0;
824         }
825         opCacheTrue = arrayInChunk!(OpFunc)(re.ir.length, memory);
826         opCacheFalse = arrayInChunk!(OpFunc)(re.ir.length, memory);
827         opCacheBackTrue = arrayInChunk!(OpBackFunc)(re.ir.length, memory);
828         opCacheBackFalse = arrayInChunk!(OpBackFunc)(re.ir.length, memory);
829 
830         for (uint pc = 0; pc<re.ir.length; pc += re.ir[pc].length)
831         {
832         L_dispatch:
833             switch (re.ir[pc].code)
834             {
835                 foreach (e; __traits(allMembers, IR))
836                 {
837             mixin(`case IR.`~e~`:
838                     opCacheTrue[pc] = &Ops!(true).op!(IR.`~e~`);
839                     opCacheBackTrue[pc] = &BackOps!(true).op!(IR.`~e~`);
840                     opCacheFalse[pc] = &Ops!(false).op!(IR.`~e~`);
841                     opCacheBackFalse[pc] = &BackOps!(false).op!(IR.`~e~`);
842                 break L_dispatch;
843                 `);
844                 }
845             default:
846                 assert(0, "Unrecognized instruction "~re.ir[pc].mnemonic);
847             }
848         }
849     }
850 
851     override Matcher!Char rearm(in Char[] data)
852     {
853         exhausted = false;
854         matched = 0;
855         s = Stream(data);
856         return this;
857     }
858 
this()859     this()(ref const Regex!Char program, Stream stream, void[] memory)
860     {
861          // We are emplace'd to malloced memory w/o blitting T.init over it\
862          // make sure we initialize all fields explicitly
863         _refCount = 1;
864         subCounters = null;
865         backrefed = null;
866         exhausted = false;
867         matched = 0;
868         re = program;
869         s = stream;
870         initExternalMemory(memory);
871         genCounter = 0;
872     }
873 
this(ThompsonMatcher matcher,size_t lo,size_t hi,uint nGroup,Stream stream)874     this(ThompsonMatcher matcher, size_t lo, size_t hi, uint nGroup, Stream stream)
875     {
876         _refCount = 1;
877         subCounters = matcher.subCounters;
878         s = stream;
879         auto code = matcher.re.ir[lo .. hi];
880         re = matcher.re.withCode(code).withNGroup(nGroup);
881         threadSize = matcher.threadSize;
882         merge = matcher.merge;
883         freelist = matcher.freelist;
884         opCacheTrue = matcher.opCacheTrue[lo .. hi];
885         opCacheBackTrue = matcher.opCacheBackTrue[lo .. hi];
886         opCacheFalse = matcher.opCacheFalse[lo .. hi];
887         opCacheBackFalse = matcher.opCacheBackFalse[lo .. hi];
888         front = matcher.front;
889         index = matcher.index;
890     }
891 
this(BackMatcher matcher,size_t lo,size_t hi,uint nGroup,Stream stream)892     this(BackMatcher matcher, size_t lo, size_t hi, uint nGroup, Stream stream)
893     {
894         _refCount = 1;
895         subCounters = matcher.subCounters;
896         s = stream;
897         auto code = matcher.re.ir[lo .. hi];
898         re = matcher.re.withCode(code).withNGroup(nGroup);
899         threadSize = matcher.threadSize;
900         merge = matcher.merge;
901         freelist = matcher.freelist;
902         opCacheTrue = matcher.opCacheBackTrue[lo .. hi];
903         opCacheBackTrue = matcher.opCacheTrue[lo .. hi];
904         opCacheFalse = matcher.opCacheBackFalse[lo .. hi];
905         opCacheBackFalse = matcher.opCacheFalse[lo .. hi];
906         front = matcher.front;
907         index = matcher.index;
908     }
909 
fwdMatcher()910     auto fwdMatcher()(size_t lo, size_t hi, uint nGroup, size_t counter)
911     {
912         auto m = new ThompsonMatcher!(Char, Stream)(this, lo, hi, nGroup, s);
913         m.genCounter = counter;
914         return m;
915     }
916 
bwdMatcher()917     auto bwdMatcher()(size_t lo, size_t hi, uint nGroup, size_t counter)
918     {
919         alias BackLooper = typeof(s.loopBack(index));
920         auto m = new ThompsonMatcher!(Char, BackLooper)(this, lo, hi, nGroup, s.loopBack(index));
921         m.genCounter = counter;
922         m.next();
923         return m;
924     }
925 
926     override void dupTo(Matcher!Char engine, void[] memory)
927     {
928         auto thompson = cast(ThompsonMatcher) engine;
929         thompson.s = s;
930         thompson.subCounters = null;
931         thompson.front = front;
932         thompson.index = index;
933         thompson.matched = matched;
934         thompson.exhausted = exhausted;
935         thompson.initExternalMemory(memory);
936     }
937 
938     override int match(Group!DataIndex[] matches)
939     {
940         debug(std_regex_matcher)
941             writeln("------------------------------------------");
942         if (exhausted)
943         {
944             return false;
945         }
946         if (re.flags & RegexInfo.oneShot)
947         {
948             next();
949             exhausted = true;
950             return matchOneShot(matches);
951         }
952         static if (kicked)
953             if (!re.kickstart.empty)
954                 return matchImpl!(true)(matches);
955         return matchImpl!(false)(matches);
956     }
957 
958     //match the input and fill matches
matchImpl(bool withSearch)959     int matchImpl(bool withSearch)(Group!DataIndex[] matches)
960     {
961         if (!matched && clist.empty)
962         {
963            static if (withSearch)
964                 search();
965            else
966                 next();
967         }
968         else//char in question is  fetched in prev call to match
969         {
970             matched = 0;
971         }
972         State state;
973         state.matches = matches;
974 
975         if (!atEnd)//if no char
976             for (;;)
977             {
978                 genCounter++;
979                 debug(std_regex_matcher)
980                 {
981                     writefln("Threaded matching threads at  %s", s[index .. s.lastIndex]);
982                     foreach (t; clist[])
983                     {
984                         assert(t);
985                         writef("pc=%s ",t.pc);
986                         write(t.matches);
987                         writeln();
988                     }
989                 }
990                 for (state.t = clist.fetch(); state.t; state.t = clist.fetch())
991                 {
992                     eval!true(&state);
993                 }
994                 //if we already have match no need to push the engine
995                 if (!matched)
996                 {
997                     state.t = createStart(index);
998                     eval!true(&state);//new thread staring at this position
999                 }
1000                 else if (nlist.empty)
1001                 {
1002                     debug(std_regex_matcher) writeln("Stopped  matching before consuming full input");
1003                     break;//not a partial match for sure
1004                 }
1005                 clist = nlist;
1006                 nlist = (ThreadList!DataIndex).init;
1007                 if (clist.tip is null)
1008                 {
1009                     static if (withSearch)
1010                     {
1011                         if (!search())
1012                             break;
1013                     }
1014                     else
1015                     {
1016                         if (!next())
1017                             break;
1018                     }
1019                 }
1020                 else if (!next())
1021                 {
1022                     if (!atEnd) return false;
1023                     exhausted = true;
1024                     break;
1025                 }
1026             }
1027 
1028         genCounter++; //increment also on each end
1029         debug(std_regex_matcher) writefln("Threaded matching threads at end");
1030         //try out all zero-width posibilities
1031         for (state.t = clist.fetch(); state.t; state.t = clist.fetch())
1032         {
1033             eval!false(&state);
1034         }
1035         if (!matched)
1036         {
1037             state.t = createStart(index);
1038             eval!false(&state);//new thread starting at end of input
1039         }
1040         if (matched)
1041         {//in case NFA found match along the way
1042          //and last possible longer alternative ultimately failed
1043             s.reset(matches[0].end);//reset to last successful match
1044             next();//and reload front character
1045             //--- here the exact state of stream was restored ---
1046             exhausted = atEnd || !(re.flags & RegexOption.global);
1047             //+ empty match advances the input
1048             if (!exhausted && matches[0].begin == matches[0].end)
1049                 next();
1050         }
1051         return matched;
1052     }
1053 
1054     /+
1055         handle succesful threads
1056     +/
1057     void finish(const(Thread!DataIndex)* t, Group!DataIndex[] matches, int code)
1058     {
1059         matches.ptr[0 .. re.ngroup] = t.matches.ptr[0 .. re.ngroup];
debug(std_regex_matcher)1060         debug(std_regex_matcher)
1061         {
1062             writef("FOUND pc=%s prog_len=%s",
1063                     t.pc, re.ir.length);
1064             if (!matches.empty)
1065                 writefln(": %s..%s", matches[0].begin, matches[0].end);
1066             foreach (v; matches)
1067                 writefln("%d .. %d", v.begin, v.end);
1068         }
1069         matched = code;
1070     }
1071 
1072     alias Ops(bool withInput) =  ThompsonOps!(ThompsonMatcher, State, withInput);
1073     alias BackOps(bool withInput) =  ThompsonOps!(BackMatcher, BackMatcher.State, withInput);
1074 
1075     /+
1076         match thread against codepoint, cutting trough all 0-width instructions
1077         and taking care of control flow, then add it to nlist
1078     +/
eval(bool withInput)1079     void eval(bool withInput)(State* state)
1080     {
1081         debug(std_regex_matcher) writeln("---- Evaluating thread");
1082         static if (withInput)
1083             while (opCacheTrue.ptr[state.t.pc](this, state)){}
1084         else
1085             while (opCacheFalse.ptr[state.t.pc](this, state)){}
1086     }
1087     enum uint RestartPc = uint.max;
1088     //match the input, evaluating IR without searching
1089     int matchOneShot(Group!DataIndex[] matches, uint startPc = 0)
1090     {
debug(std_regex_matcher)1091         debug(std_regex_matcher)
1092         {
1093             writefln("---------------single shot match ----------------- ");
1094         }
1095         alias evalFn = eval;
1096         assert(clist == (ThreadList!DataIndex).init || startPc == RestartPc); // incorrect after a partial match
1097         assert(nlist == (ThreadList!DataIndex).init || startPc == RestartPc);
1098         State state;
1099         state.matches = matches;
1100         if (!atEnd)//if no char
1101         {
debug(std_regex_matcher)1102             debug(std_regex_matcher)
1103             {
1104                 writefln("-- Threaded matching threads at  %s",  s[index .. s.lastIndex]);
1105             }
1106             if (startPc != RestartPc)
1107             {
1108                 state.t = createStart(index, startPc);
1109                 genCounter++;
1110                 evalFn!true(&state);
1111             }
1112             for (;;)
1113             {
1114                 debug(std_regex_matcher) writeln("\n-- Started iteration of main cycle");
1115                 genCounter++;
debug(std_regex_matcher)1116                 debug(std_regex_matcher)
1117                 {
1118                     foreach (t; clist[])
1119                     {
1120                         assert(t);
1121                     }
1122                 }
1123                 for (state.t = clist.fetch(); state.t; state.t = clist.fetch())
1124                 {
1125                     evalFn!true(&state);
1126                 }
1127                 if (nlist.empty)
1128                 {
1129                     debug(std_regex_matcher) writeln("Stopped  matching before consuming full input");
1130                     break;//not a partial match for sure
1131                 }
1132                 clist = nlist;
1133                 nlist = (ThreadList!DataIndex).init;
1134                 if (!next())
1135                     break;
1136                 debug(std_regex_matcher) writeln("-- Ended iteration of main cycle\n");
1137             }
1138         }
1139         genCounter++; //increment also on each end
1140         debug(std_regex_matcher) writefln("-- Matching threads at end");
1141         //try out all zero-width posibilities
1142         for (state.t = clist.fetch(); state.t; state.t = clist.fetch())
1143         {
1144             evalFn!false(&state);
1145         }
1146         if (!matched)
1147         {
1148             state.t = createStart(index, startPc);
1149             evalFn!false(&state);
1150         }
1151         return matched;
1152     }
1153 
1154     //get a dirty recycled Thread
1155     Thread!DataIndex* allocate()
1156     {
1157         assert(freelist, "not enough preallocated memory");
1158         Thread!DataIndex* t = freelist;
1159         freelist = freelist.next;
1160         return t;
1161     }
1162 
1163     //link memory into a free list of Threads
prepareFreeList(size_t size,ref void[]memory)1164     void prepareFreeList(size_t size, ref void[] memory)
1165     {
1166         void[] mem = memory[0 .. threadSize*size];
1167         memory = memory[threadSize * size .. $];
1168         freelist = cast(Thread!DataIndex*)&mem[0];
1169         size_t i;
1170         for (i = threadSize; i < threadSize*size; i += threadSize)
1171             (cast(Thread!DataIndex*)&mem[i-threadSize]).next = cast(Thread!DataIndex*)&mem[i];
1172         (cast(Thread!DataIndex*)&mem[i-threadSize]).next = null;
1173     }
1174 
1175     //dispose a thread
1176     void recycle(Thread!DataIndex* t)
1177     {
1178         t.next = freelist;
1179         freelist = t;
1180     }
1181 
1182     //dispose list of threads
1183     void recycle(ref ThreadList!DataIndex list)
1184     {
1185         if (list.tip)
1186         {
1187             // just put this head-tail list in front of freelist
1188             list.toe.next = freelist;
1189             freelist = list.tip;
1190             list = list.init;
1191         }
1192     }
1193 
1194     //creates a copy of master thread with given pc
1195     Thread!DataIndex* fork(Thread!DataIndex* master, uint pc, uint counter)
1196     {
1197         auto t = allocate();
1198         t.matches.ptr[0 .. re.ngroup] = master.matches.ptr[0 .. re.ngroup];
1199         t.pc = pc;
1200         t.counter = counter;
1201         t.uopCounter = 0;
1202         return t;
1203     }
1204 
1205     //creates a start thread
1206     Thread!DataIndex* createStart(DataIndex index, uint pc = 0)
1207     {
1208         auto t = allocate();
1209         t.matches.ptr[0 .. re.ngroup] = (Group!DataIndex).init;
1210         t.matches[0].begin = index;
1211         t.pc = pc;
1212         t.counter = 0;
1213         t.uopCounter = 0;
1214         return t;
1215     }
1216 }
1217