1*627f7eb2Smrg /*
2*627f7eb2Smrg Implementation of backtracking std.regex engine.
3*627f7eb2Smrg Contains both compile-time and run-time versions.
4*627f7eb2Smrg */
5*627f7eb2Smrg module std.regex.internal.backtracking;
6*627f7eb2Smrg
7*627f7eb2Smrg package(std.regex):
8*627f7eb2Smrg
9*627f7eb2Smrg import core.stdc.stdlib, std.range.primitives, std.traits, std.typecons;
10*627f7eb2Smrg import std.regex.internal.ir;
11*627f7eb2Smrg
12*627f7eb2Smrg /+
13*627f7eb2Smrg BacktrackingMatcher implements backtracking scheme of matching
14*627f7eb2Smrg regular expressions.
15*627f7eb2Smrg +/
BacktrackingMatcher(bool CTregex)16*627f7eb2Smrg template BacktrackingMatcher(bool CTregex)
17*627f7eb2Smrg {
18*627f7eb2Smrg @trusted struct BacktrackingMatcher(Char, Stream = Input!Char)
19*627f7eb2Smrg if (is(Char : dchar))
20*627f7eb2Smrg {
21*627f7eb2Smrg alias DataIndex = Stream.DataIndex;
22*627f7eb2Smrg struct State
23*627f7eb2Smrg {//top bit in pc is set if saved along with matches
24*627f7eb2Smrg DataIndex index;
25*627f7eb2Smrg uint pc, counter, infiniteNesting;
26*627f7eb2Smrg }
27*627f7eb2Smrg static assert(State.sizeof % size_t.sizeof == 0);
28*627f7eb2Smrg enum stateSize = State.sizeof / size_t.sizeof;
29*627f7eb2Smrg enum initialStack = 1 << 11; // items in a block of segmented stack
30*627f7eb2Smrg alias String = const(Char)[];
31*627f7eb2Smrg alias RegEx = Regex!Char;
32*627f7eb2Smrg alias MatchFn = bool function (ref BacktrackingMatcher!(Char, Stream));
33*627f7eb2Smrg RegEx re; //regex program
34*627f7eb2Smrg static if (CTregex)
35*627f7eb2Smrg MatchFn nativeFn; //native code for that program
36*627f7eb2Smrg //Stream state
37*627f7eb2Smrg Stream s;
38*627f7eb2Smrg DataIndex index;
39*627f7eb2Smrg dchar front;
40*627f7eb2Smrg bool exhausted;
41*627f7eb2Smrg //backtracking machine state
42*627f7eb2Smrg uint pc, counter;
43*627f7eb2Smrg DataIndex lastState = 0; //top of state stack
44*627f7eb2Smrg static if (!CTregex)
45*627f7eb2Smrg uint infiniteNesting;
46*627f7eb2Smrg size_t[] memory;
47*627f7eb2Smrg Trace[] merge;
48*627f7eb2Smrg static struct Trace
49*627f7eb2Smrg {
50*627f7eb2Smrg ulong mask;
51*627f7eb2Smrg size_t offset;
52*627f7eb2Smrg
53*627f7eb2Smrg bool mark(size_t idx)
54*627f7eb2Smrg {
55*627f7eb2Smrg immutable d = idx - offset;
56*627f7eb2Smrg if (d < 64) // including overflow
57*627f7eb2Smrg {
58*627f7eb2Smrg immutable p = mask & (1UL << d);
59*627f7eb2Smrg mask |= 1UL << d;
60*627f7eb2Smrg return p != 0;
61*627f7eb2Smrg }
62*627f7eb2Smrg else
63*627f7eb2Smrg {
64*627f7eb2Smrg offset = idx;
65*627f7eb2Smrg mask = 1;
66*627f7eb2Smrg return false;
67*627f7eb2Smrg }
68*627f7eb2Smrg }
69*627f7eb2Smrg }
70*627f7eb2Smrg //local slice of matches, global for backref
71*627f7eb2Smrg Group!DataIndex[] matches, backrefed;
72*627f7eb2Smrg
73*627f7eb2Smrg static if (__traits(hasMember,Stream, "search"))
74*627f7eb2Smrg {
75*627f7eb2Smrg enum kicked = true;
76*627f7eb2Smrg }
77*627f7eb2Smrg else
78*627f7eb2Smrg enum kicked = false;
79*627f7eb2Smrg
80*627f7eb2Smrg static size_t initialMemory(const ref RegEx re)
81*627f7eb2Smrg {
82*627f7eb2Smrg return stackSize(re)*size_t.sizeof + re.hotspotTableSize*Trace.sizeof;
83*627f7eb2Smrg }
84*627f7eb2Smrg
85*627f7eb2Smrg static size_t stackSize(const ref RegEx re)
86*627f7eb2Smrg {
87*627f7eb2Smrg size_t itemSize = stateSize
88*627f7eb2Smrg + re.ngroup * (Group!DataIndex).sizeof / size_t.sizeof;
89*627f7eb2Smrg return initialStack * itemSize + 2;
90*627f7eb2Smrg }
91*627f7eb2Smrg
92*627f7eb2Smrg @property bool atStart(){ return index == 0; }
93*627f7eb2Smrg
94*627f7eb2Smrg @property bool atEnd(){ return index == s.lastIndex && s.atEnd; }
95*627f7eb2Smrg
96*627f7eb2Smrg void next()
97*627f7eb2Smrg {
98*627f7eb2Smrg if (!s.nextChar(front, index))
99*627f7eb2Smrg index = s.lastIndex;
100*627f7eb2Smrg }
101*627f7eb2Smrg
102*627f7eb2Smrg void search()
103*627f7eb2Smrg {
104*627f7eb2Smrg static if (kicked)
105*627f7eb2Smrg {
106*627f7eb2Smrg if (!s.search(re.kickstart, front, index))
107*627f7eb2Smrg {
108*627f7eb2Smrg index = s.lastIndex;
109*627f7eb2Smrg }
110*627f7eb2Smrg }
111*627f7eb2Smrg else
112*627f7eb2Smrg next();
113*627f7eb2Smrg }
114*627f7eb2Smrg
115*627f7eb2Smrg //
116*627f7eb2Smrg void newStack()
117*627f7eb2Smrg {
118*627f7eb2Smrg auto chunk = mallocArray!(size_t)(stackSize(re));
119*627f7eb2Smrg chunk[0] = cast(size_t)(memory.ptr);
120*627f7eb2Smrg chunk[1] = lastState;
121*627f7eb2Smrg memory = chunk[2..$];
122*627f7eb2Smrg lastState = 0;
123*627f7eb2Smrg }
124*627f7eb2Smrg
125*627f7eb2Smrg bool prevStack()
126*627f7eb2Smrg {
127*627f7eb2Smrg // pointer to previous block
128*627f7eb2Smrg size_t* prev = cast(size_t*) memory.ptr[-2];
129*627f7eb2Smrg if (!prev)
130*627f7eb2Smrg {
131*627f7eb2Smrg // The last segment is freed in RegexMatch
132*627f7eb2Smrg return false;
133*627f7eb2Smrg }
134*627f7eb2Smrg else
135*627f7eb2Smrg {
136*627f7eb2Smrg import core.stdc.stdlib : free;
137*627f7eb2Smrg // memory used in previous block
138*627f7eb2Smrg size_t size = memory.ptr[-1];
139*627f7eb2Smrg free(memory.ptr-2);
140*627f7eb2Smrg memory = prev[0 .. size];
141*627f7eb2Smrg lastState = size;
142*627f7eb2Smrg return true;
143*627f7eb2Smrg }
144*627f7eb2Smrg }
145*627f7eb2Smrg
146*627f7eb2Smrg void initExternalMemory(void[] memBlock)
147*627f7eb2Smrg {
148*627f7eb2Smrg merge = arrayInChunk!(Trace)(re.hotspotTableSize, memBlock);
149*627f7eb2Smrg merge[] = Trace.init;
150*627f7eb2Smrg memory = cast(size_t[]) memBlock;
151*627f7eb2Smrg memory[0] = 0; // hidden pointer
152*627f7eb2Smrg memory[1] = 0; // used size
153*627f7eb2Smrg memory = memory[2..$];
154*627f7eb2Smrg }
155*627f7eb2Smrg
156*627f7eb2Smrg void initialize(ref RegEx program, Stream stream, void[] memBlock)
157*627f7eb2Smrg {
158*627f7eb2Smrg re = program;
159*627f7eb2Smrg s = stream;
160*627f7eb2Smrg exhausted = false;
161*627f7eb2Smrg initExternalMemory(memBlock);
162*627f7eb2Smrg backrefed = null;
163*627f7eb2Smrg }
164*627f7eb2Smrg
165*627f7eb2Smrg auto dupTo(void[] memory)
166*627f7eb2Smrg {
167*627f7eb2Smrg typeof(this) tmp = this;
168*627f7eb2Smrg tmp.initExternalMemory(memory);
169*627f7eb2Smrg return tmp;
170*627f7eb2Smrg }
171*627f7eb2Smrg
172*627f7eb2Smrg this(ref RegEx program, Stream stream, void[] memBlock, dchar ch, DataIndex idx)
173*627f7eb2Smrg {
174*627f7eb2Smrg initialize(program, stream, memBlock);
175*627f7eb2Smrg front = ch;
176*627f7eb2Smrg index = idx;
177*627f7eb2Smrg }
178*627f7eb2Smrg
179*627f7eb2Smrg this(ref RegEx program, Stream stream, void[] memBlock)
180*627f7eb2Smrg {
181*627f7eb2Smrg initialize(program, stream, memBlock);
182*627f7eb2Smrg next();
183*627f7eb2Smrg }
184*627f7eb2Smrg
185*627f7eb2Smrg auto fwdMatcher(ref BacktrackingMatcher matcher, void[] memBlock)
186*627f7eb2Smrg {
187*627f7eb2Smrg alias BackMatcherTempl = .BacktrackingMatcher!(CTregex);
188*627f7eb2Smrg alias BackMatcher = BackMatcherTempl!(Char, Stream);
189*627f7eb2Smrg auto fwdMatcher = BackMatcher(matcher.re, s, memBlock, front, index);
190*627f7eb2Smrg return fwdMatcher;
191*627f7eb2Smrg }
192*627f7eb2Smrg
193*627f7eb2Smrg auto bwdMatcher(ref BacktrackingMatcher matcher, void[] memBlock)
194*627f7eb2Smrg {
195*627f7eb2Smrg alias BackMatcherTempl = .BacktrackingMatcher!(CTregex);
196*627f7eb2Smrg alias BackMatcher = BackMatcherTempl!(Char, typeof(s.loopBack(index)));
197*627f7eb2Smrg auto fwdMatcher =
198*627f7eb2Smrg BackMatcher(matcher.re, s.loopBack(index), memBlock);
199*627f7eb2Smrg return fwdMatcher;
200*627f7eb2Smrg }
201*627f7eb2Smrg
202*627f7eb2Smrg //
203*627f7eb2Smrg int matchFinalize()
204*627f7eb2Smrg {
205*627f7eb2Smrg immutable start = index;
206*627f7eb2Smrg immutable val = matchImpl();
207*627f7eb2Smrg if (val)
208*627f7eb2Smrg {//stream is updated here
209*627f7eb2Smrg matches[0].begin = start;
210*627f7eb2Smrg matches[0].end = index;
211*627f7eb2Smrg if (!(re.flags & RegexOption.global) || atEnd)
212*627f7eb2Smrg exhausted = true;
213*627f7eb2Smrg if (start == index)//empty match advances input
214*627f7eb2Smrg next();
215*627f7eb2Smrg return val;
216*627f7eb2Smrg }
217*627f7eb2Smrg else
218*627f7eb2Smrg return 0;
219*627f7eb2Smrg }
220*627f7eb2Smrg
221*627f7eb2Smrg //lookup next match, fill matches with indices into input
222*627f7eb2Smrg int match(Group!DataIndex[] matches)
223*627f7eb2Smrg {
224*627f7eb2Smrg debug(std_regex_matcher)
225*627f7eb2Smrg {
226*627f7eb2Smrg writeln("------------------------------------------");
227*627f7eb2Smrg }
228*627f7eb2Smrg if (exhausted) //all matches collected
229*627f7eb2Smrg return false;
230*627f7eb2Smrg this.matches = matches;
231*627f7eb2Smrg if (re.flags & RegexInfo.oneShot)
232*627f7eb2Smrg {
233*627f7eb2Smrg exhausted = true;
234*627f7eb2Smrg const DataIndex start = index;
235*627f7eb2Smrg immutable m = matchImpl();
236*627f7eb2Smrg if (m)
237*627f7eb2Smrg {
238*627f7eb2Smrg matches[0].begin = start;
239*627f7eb2Smrg matches[0].end = index;
240*627f7eb2Smrg }
241*627f7eb2Smrg return m;
242*627f7eb2Smrg }
243*627f7eb2Smrg static if (kicked)
244*627f7eb2Smrg {
245*627f7eb2Smrg if (!re.kickstart.empty)
246*627f7eb2Smrg {
247*627f7eb2Smrg for (;;)
248*627f7eb2Smrg {
249*627f7eb2Smrg immutable val = matchFinalize();
250*627f7eb2Smrg if (val)
251*627f7eb2Smrg return val;
252*627f7eb2Smrg else
253*627f7eb2Smrg {
254*627f7eb2Smrg if (atEnd)
255*627f7eb2Smrg break;
256*627f7eb2Smrg search();
257*627f7eb2Smrg if (atEnd)
258*627f7eb2Smrg {
259*627f7eb2Smrg exhausted = true;
260*627f7eb2Smrg return matchFinalize();
261*627f7eb2Smrg }
262*627f7eb2Smrg }
263*627f7eb2Smrg }
264*627f7eb2Smrg exhausted = true;
265*627f7eb2Smrg return 0; //early return
266*627f7eb2Smrg }
267*627f7eb2Smrg }
268*627f7eb2Smrg //no search available - skip a char at a time
269*627f7eb2Smrg for (;;)
270*627f7eb2Smrg {
271*627f7eb2Smrg immutable val = matchFinalize();
272*627f7eb2Smrg if (val)
273*627f7eb2Smrg return val;
274*627f7eb2Smrg else
275*627f7eb2Smrg {
276*627f7eb2Smrg if (atEnd)
277*627f7eb2Smrg break;
278*627f7eb2Smrg next();
279*627f7eb2Smrg if (atEnd)
280*627f7eb2Smrg {
281*627f7eb2Smrg exhausted = true;
282*627f7eb2Smrg return matchFinalize();
283*627f7eb2Smrg }
284*627f7eb2Smrg }
285*627f7eb2Smrg }
286*627f7eb2Smrg exhausted = true;
287*627f7eb2Smrg return 0;
288*627f7eb2Smrg }
289*627f7eb2Smrg
290*627f7eb2Smrg /+
291*627f7eb2Smrg match subexpression against input,
292*627f7eb2Smrg results are stored in matches
293*627f7eb2Smrg +/
294*627f7eb2Smrg int matchImpl()
295*627f7eb2Smrg {
296*627f7eb2Smrg static if (CTregex && is(typeof(nativeFn(this))))
297*627f7eb2Smrg {
298*627f7eb2Smrg debug(std_regex_ctr) writeln("using C-T matcher");
299*627f7eb2Smrg return nativeFn(this);
300*627f7eb2Smrg }
301*627f7eb2Smrg else
302*627f7eb2Smrg {
303*627f7eb2Smrg pc = 0;
304*627f7eb2Smrg counter = 0;
305*627f7eb2Smrg lastState = 0;
306*627f7eb2Smrg matches[] = Group!DataIndex.init;
307*627f7eb2Smrg auto start = s._index;
308*627f7eb2Smrg debug(std_regex_matcher)
309*627f7eb2Smrg writeln("Try match starting at ", s[index .. s.lastIndex]);
310*627f7eb2Smrg for (;;)
311*627f7eb2Smrg {
312*627f7eb2Smrg debug(std_regex_matcher)
313*627f7eb2Smrg writefln("PC: %s\tCNT: %s\t%s \tfront: %s src: %s",
314*627f7eb2Smrg pc, counter, disassemble(re.ir, pc, re.dict),
315*627f7eb2Smrg front, s._index);
316*627f7eb2Smrg switch (re.ir[pc].code)
317*627f7eb2Smrg {
318*627f7eb2Smrg case IR.OrChar://assumes IRL!(OrChar) == 1
319*627f7eb2Smrg if (atEnd)
320*627f7eb2Smrg goto L_backtrack;
321*627f7eb2Smrg uint len = re.ir[pc].sequence;
322*627f7eb2Smrg uint end = pc + len;
323*627f7eb2Smrg if (re.ir[pc].data != front && re.ir[pc+1].data != front)
324*627f7eb2Smrg {
325*627f7eb2Smrg for (pc = pc+2; pc < end; pc++)
326*627f7eb2Smrg if (re.ir[pc].data == front)
327*627f7eb2Smrg break;
328*627f7eb2Smrg if (pc == end)
329*627f7eb2Smrg goto L_backtrack;
330*627f7eb2Smrg }
331*627f7eb2Smrg pc = end;
332*627f7eb2Smrg next();
333*627f7eb2Smrg break;
334*627f7eb2Smrg case IR.Char:
335*627f7eb2Smrg if (atEnd || front != re.ir[pc].data)
336*627f7eb2Smrg goto L_backtrack;
337*627f7eb2Smrg pc += IRL!(IR.Char);
338*627f7eb2Smrg next();
339*627f7eb2Smrg break;
340*627f7eb2Smrg case IR.Any:
341*627f7eb2Smrg if (atEnd)
342*627f7eb2Smrg goto L_backtrack;
343*627f7eb2Smrg pc += IRL!(IR.Any);
344*627f7eb2Smrg next();
345*627f7eb2Smrg break;
346*627f7eb2Smrg case IR.CodepointSet:
347*627f7eb2Smrg if (atEnd || !re.charsets[re.ir[pc].data].scanFor(front))
348*627f7eb2Smrg goto L_backtrack;
349*627f7eb2Smrg next();
350*627f7eb2Smrg pc += IRL!(IR.CodepointSet);
351*627f7eb2Smrg break;
352*627f7eb2Smrg case IR.Trie:
353*627f7eb2Smrg if (atEnd || !re.matchers[re.ir[pc].data][front])
354*627f7eb2Smrg goto L_backtrack;
355*627f7eb2Smrg next();
356*627f7eb2Smrg pc += IRL!(IR.Trie);
357*627f7eb2Smrg break;
358*627f7eb2Smrg case IR.Wordboundary:
359*627f7eb2Smrg dchar back;
360*627f7eb2Smrg DataIndex bi;
361*627f7eb2Smrg //at start & end of input
362*627f7eb2Smrg if (atStart && wordMatcher[front])
363*627f7eb2Smrg {
364*627f7eb2Smrg pc += IRL!(IR.Wordboundary);
365*627f7eb2Smrg break;
366*627f7eb2Smrg }
367*627f7eb2Smrg else if (atEnd && s.loopBack(index).nextChar(back, bi)
368*627f7eb2Smrg && wordMatcher[back])
369*627f7eb2Smrg {
370*627f7eb2Smrg pc += IRL!(IR.Wordboundary);
371*627f7eb2Smrg break;
372*627f7eb2Smrg }
373*627f7eb2Smrg else if (s.loopBack(index).nextChar(back, bi))
374*627f7eb2Smrg {
375*627f7eb2Smrg immutable af = wordMatcher[front];
376*627f7eb2Smrg immutable ab = wordMatcher[back];
377*627f7eb2Smrg if (af ^ ab)
378*627f7eb2Smrg {
379*627f7eb2Smrg pc += IRL!(IR.Wordboundary);
380*627f7eb2Smrg break;
381*627f7eb2Smrg }
382*627f7eb2Smrg }
383*627f7eb2Smrg goto L_backtrack;
384*627f7eb2Smrg case IR.Notwordboundary:
385*627f7eb2Smrg dchar back;
386*627f7eb2Smrg DataIndex bi;
387*627f7eb2Smrg //at start & end of input
388*627f7eb2Smrg if (atStart && wordMatcher[front])
389*627f7eb2Smrg goto L_backtrack;
390*627f7eb2Smrg else if (atEnd && s.loopBack(index).nextChar(back, bi)
391*627f7eb2Smrg && wordMatcher[back])
392*627f7eb2Smrg goto L_backtrack;
393*627f7eb2Smrg else if (s.loopBack(index).nextChar(back, bi))
394*627f7eb2Smrg {
395*627f7eb2Smrg immutable af = wordMatcher[front];
396*627f7eb2Smrg immutable ab = wordMatcher[back];
397*627f7eb2Smrg if (af ^ ab)
398*627f7eb2Smrg goto L_backtrack;
399*627f7eb2Smrg }
400*627f7eb2Smrg pc += IRL!(IR.Wordboundary);
401*627f7eb2Smrg break;
402*627f7eb2Smrg case IR.Bof:
403*627f7eb2Smrg if (atStart)
404*627f7eb2Smrg pc += IRL!(IR.Bol);
405*627f7eb2Smrg else
406*627f7eb2Smrg goto L_backtrack;
407*627f7eb2Smrg break;
408*627f7eb2Smrg case IR.Bol:
409*627f7eb2Smrg dchar back;
410*627f7eb2Smrg DataIndex bi;
411*627f7eb2Smrg if (atStart)
412*627f7eb2Smrg pc += IRL!(IR.Bol);
413*627f7eb2Smrg else if (s.loopBack(index).nextChar(back,bi)
414*627f7eb2Smrg && endOfLine(back, front == '\n'))
415*627f7eb2Smrg {
416*627f7eb2Smrg pc += IRL!(IR.Bol);
417*627f7eb2Smrg }
418*627f7eb2Smrg else
419*627f7eb2Smrg goto L_backtrack;
420*627f7eb2Smrg break;
421*627f7eb2Smrg case IR.Eof:
422*627f7eb2Smrg if (atEnd)
423*627f7eb2Smrg pc += IRL!(IR.Eol);
424*627f7eb2Smrg else
425*627f7eb2Smrg goto L_backtrack;
426*627f7eb2Smrg break;
427*627f7eb2Smrg case IR.Eol:
428*627f7eb2Smrg dchar back;
429*627f7eb2Smrg DataIndex bi;
430*627f7eb2Smrg debug(std_regex_matcher) writefln("EOL (front 0x%x) %s", front, s[index .. s.lastIndex]);
431*627f7eb2Smrg //no matching inside \r\n
432*627f7eb2Smrg if (atEnd || (endOfLine(front, s.loopBack(index).nextChar(back,bi)
433*627f7eb2Smrg && back == '\r')))
434*627f7eb2Smrg {
435*627f7eb2Smrg pc += IRL!(IR.Eol);
436*627f7eb2Smrg }
437*627f7eb2Smrg else
438*627f7eb2Smrg goto L_backtrack;
439*627f7eb2Smrg break;
440*627f7eb2Smrg case IR.InfiniteStart, IR.InfiniteQStart:
441*627f7eb2Smrg pc += re.ir[pc].data + IRL!(IR.InfiniteStart);
442*627f7eb2Smrg //now pc is at end IR.Infinite(Q)End
443*627f7eb2Smrg uint len = re.ir[pc].data;
444*627f7eb2Smrg if (re.ir[pc].code == IR.InfiniteEnd)
445*627f7eb2Smrg {
446*627f7eb2Smrg pushState(pc+IRL!(IR.InfiniteEnd), counter);
447*627f7eb2Smrg pc -= len;
448*627f7eb2Smrg }
449*627f7eb2Smrg else
450*627f7eb2Smrg {
451*627f7eb2Smrg pushState(pc - len, counter);
452*627f7eb2Smrg pc += IRL!(IR.InfiniteEnd);
453*627f7eb2Smrg }
454*627f7eb2Smrg break;
455*627f7eb2Smrg case IR.InfiniteBloomStart:
456*627f7eb2Smrg pc += re.ir[pc].data + IRL!(IR.InfiniteBloomStart);
457*627f7eb2Smrg //now pc is at end IR.InfiniteBloomEnd
458*627f7eb2Smrg immutable len = re.ir[pc].data;
459*627f7eb2Smrg immutable filterIdx = re.ir[pc+2].raw;
460*627f7eb2Smrg if (re.filters[filterIdx][front])
461*627f7eb2Smrg pushState(pc+IRL!(IR.InfiniteBloomEnd), counter);
462*627f7eb2Smrg pc -= len;
463*627f7eb2Smrg break;
464*627f7eb2Smrg case IR.RepeatStart, IR.RepeatQStart:
465*627f7eb2Smrg pc += re.ir[pc].data + IRL!(IR.RepeatStart);
466*627f7eb2Smrg break;
467*627f7eb2Smrg case IR.RepeatEnd:
468*627f7eb2Smrg case IR.RepeatQEnd:
469*627f7eb2Smrg if (merge[re.ir[pc + 1].raw+counter].mark(index))
470*627f7eb2Smrg {
471*627f7eb2Smrg // merged!
472*627f7eb2Smrg goto L_backtrack;
473*627f7eb2Smrg }
474*627f7eb2Smrg //len, step, min, max
475*627f7eb2Smrg immutable len = re.ir[pc].data;
476*627f7eb2Smrg immutable step = re.ir[pc+2].raw;
477*627f7eb2Smrg immutable min = re.ir[pc+3].raw;
478*627f7eb2Smrg immutable max = re.ir[pc+4].raw;
479*627f7eb2Smrg if (counter < min)
480*627f7eb2Smrg {
481*627f7eb2Smrg counter += step;
482*627f7eb2Smrg pc -= len;
483*627f7eb2Smrg }
484*627f7eb2Smrg else if (counter < max)
485*627f7eb2Smrg {
486*627f7eb2Smrg if (re.ir[pc].code == IR.RepeatEnd)
487*627f7eb2Smrg {
488*627f7eb2Smrg pushState(pc + IRL!(IR.RepeatEnd), counter%step);
489*627f7eb2Smrg counter += step;
490*627f7eb2Smrg pc -= len;
491*627f7eb2Smrg }
492*627f7eb2Smrg else
493*627f7eb2Smrg {
494*627f7eb2Smrg pushState(pc - len, counter + step);
495*627f7eb2Smrg counter = counter%step;
496*627f7eb2Smrg pc += IRL!(IR.RepeatEnd);
497*627f7eb2Smrg }
498*627f7eb2Smrg }
499*627f7eb2Smrg else
500*627f7eb2Smrg {
501*627f7eb2Smrg counter = counter%step;
502*627f7eb2Smrg pc += IRL!(IR.RepeatEnd);
503*627f7eb2Smrg }
504*627f7eb2Smrg break;
505*627f7eb2Smrg case IR.InfiniteEnd:
506*627f7eb2Smrg case IR.InfiniteQEnd:
507*627f7eb2Smrg debug(std_regex_matcher) writeln("Infinited nesting:", infiniteNesting);
508*627f7eb2Smrg if (merge[re.ir[pc + 1].raw+counter].mark(index))
509*627f7eb2Smrg {
510*627f7eb2Smrg // merged!
511*627f7eb2Smrg goto L_backtrack;
512*627f7eb2Smrg }
513*627f7eb2Smrg immutable len = re.ir[pc].data;
514*627f7eb2Smrg if (re.ir[pc].code == IR.InfiniteEnd)
515*627f7eb2Smrg {
516*627f7eb2Smrg pushState(pc + IRL!(IR.InfiniteEnd), counter);
517*627f7eb2Smrg pc -= len;
518*627f7eb2Smrg }
519*627f7eb2Smrg else
520*627f7eb2Smrg {
521*627f7eb2Smrg pushState(pc-len, counter);
522*627f7eb2Smrg pc += IRL!(IR.InfiniteEnd);
523*627f7eb2Smrg }
524*627f7eb2Smrg break;
525*627f7eb2Smrg case IR.InfiniteBloomEnd:
526*627f7eb2Smrg debug(std_regex_matcher) writeln("Infinited nesting:", infiniteNesting);
527*627f7eb2Smrg if (merge[re.ir[pc + 1].raw+counter].mark(index))
528*627f7eb2Smrg {
529*627f7eb2Smrg // merged!
530*627f7eb2Smrg goto L_backtrack;
531*627f7eb2Smrg }
532*627f7eb2Smrg immutable len = re.ir[pc].data;
533*627f7eb2Smrg immutable filterIdx = re.ir[pc+2].raw;
534*627f7eb2Smrg if (re.filters[filterIdx][front])
535*627f7eb2Smrg {
536*627f7eb2Smrg infiniteNesting--;
537*627f7eb2Smrg pushState(pc + IRL!(IR.InfiniteBloomEnd), counter);
538*627f7eb2Smrg infiniteNesting++;
539*627f7eb2Smrg }
540*627f7eb2Smrg pc -= len;
541*627f7eb2Smrg break;
542*627f7eb2Smrg case IR.OrEnd:
543*627f7eb2Smrg if (merge[re.ir[pc + 1].raw+counter].mark(index))
544*627f7eb2Smrg {
545*627f7eb2Smrg // merged!
546*627f7eb2Smrg goto L_backtrack;
547*627f7eb2Smrg }
548*627f7eb2Smrg pc += IRL!(IR.OrEnd);
549*627f7eb2Smrg break;
550*627f7eb2Smrg case IR.OrStart:
551*627f7eb2Smrg pc += IRL!(IR.OrStart);
552*627f7eb2Smrg goto case;
553*627f7eb2Smrg case IR.Option:
554*627f7eb2Smrg immutable len = re.ir[pc].data;
555*627f7eb2Smrg if (re.ir[pc+len].code == IR.GotoEndOr)//not a last one
556*627f7eb2Smrg {
557*627f7eb2Smrg pushState(pc + len + IRL!(IR.Option), counter); //remember 2nd branch
558*627f7eb2Smrg }
559*627f7eb2Smrg pc += IRL!(IR.Option);
560*627f7eb2Smrg break;
561*627f7eb2Smrg case IR.GotoEndOr:
562*627f7eb2Smrg pc = pc + re.ir[pc].data + IRL!(IR.GotoEndOr);
563*627f7eb2Smrg break;
564*627f7eb2Smrg case IR.GroupStart:
565*627f7eb2Smrg immutable n = re.ir[pc].data;
566*627f7eb2Smrg matches[n].begin = index;
567*627f7eb2Smrg debug(std_regex_matcher) writefln("IR group #%u starts at %u", n, index);
568*627f7eb2Smrg pc += IRL!(IR.GroupStart);
569*627f7eb2Smrg break;
570*627f7eb2Smrg case IR.GroupEnd:
571*627f7eb2Smrg immutable n = re.ir[pc].data;
572*627f7eb2Smrg matches[n].end = index;
573*627f7eb2Smrg debug(std_regex_matcher) writefln("IR group #%u ends at %u", n, index);
574*627f7eb2Smrg pc += IRL!(IR.GroupEnd);
575*627f7eb2Smrg break;
576*627f7eb2Smrg case IR.LookaheadStart:
577*627f7eb2Smrg case IR.NeglookaheadStart:
578*627f7eb2Smrg immutable len = re.ir[pc].data;
579*627f7eb2Smrg auto save = index;
580*627f7eb2Smrg immutable ms = re.ir[pc+1].raw, me = re.ir[pc+2].raw;
581*627f7eb2Smrg auto mem = malloc(initialMemory(re))[0 .. initialMemory(re)];
582*627f7eb2Smrg scope(exit) free(mem.ptr);
583*627f7eb2Smrg static if (Stream.isLoopback)
584*627f7eb2Smrg {
585*627f7eb2Smrg auto matcher = bwdMatcher(this, mem);
586*627f7eb2Smrg }
587*627f7eb2Smrg else
588*627f7eb2Smrg {
589*627f7eb2Smrg auto matcher = fwdMatcher(this, mem);
590*627f7eb2Smrg }
591*627f7eb2Smrg matcher.matches = matches[ms .. me];
592*627f7eb2Smrg matcher.backrefed = backrefed.empty ? matches : backrefed;
593*627f7eb2Smrg matcher.re.ir = re.ir[
594*627f7eb2Smrg pc+IRL!(IR.LookaheadStart) .. pc+IRL!(IR.LookaheadStart)+len+IRL!(IR.LookaheadEnd)
595*627f7eb2Smrg ];
596*627f7eb2Smrg immutable match = (matcher.matchImpl() != 0) ^ (re.ir[pc].code == IR.NeglookaheadStart);
597*627f7eb2Smrg s.reset(save);
598*627f7eb2Smrg next();
599*627f7eb2Smrg if (!match)
600*627f7eb2Smrg goto L_backtrack;
601*627f7eb2Smrg else
602*627f7eb2Smrg {
603*627f7eb2Smrg pc += IRL!(IR.LookaheadStart)+len+IRL!(IR.LookaheadEnd);
604*627f7eb2Smrg }
605*627f7eb2Smrg break;
606*627f7eb2Smrg case IR.LookbehindStart:
607*627f7eb2Smrg case IR.NeglookbehindStart:
608*627f7eb2Smrg immutable len = re.ir[pc].data;
609*627f7eb2Smrg immutable ms = re.ir[pc+1].raw, me = re.ir[pc+2].raw;
610*627f7eb2Smrg auto mem = malloc(initialMemory(re))[0 .. initialMemory(re)];
611*627f7eb2Smrg scope(exit) free(mem.ptr);
612*627f7eb2Smrg static if (Stream.isLoopback)
613*627f7eb2Smrg {
614*627f7eb2Smrg alias Matcher = BacktrackingMatcher!(Char, Stream);
615*627f7eb2Smrg auto matcher = Matcher(re, s, mem, front, index);
616*627f7eb2Smrg }
617*627f7eb2Smrg else
618*627f7eb2Smrg {
619*627f7eb2Smrg alias Matcher = BacktrackingMatcher!(Char, typeof(s.loopBack(index)));
620*627f7eb2Smrg auto matcher = Matcher(re, s.loopBack(index), mem);
621*627f7eb2Smrg }
622*627f7eb2Smrg matcher.matches = matches[ms .. me];
623*627f7eb2Smrg matcher.re.ir = re.ir[
624*627f7eb2Smrg pc + IRL!(IR.LookbehindStart) .. pc + IRL!(IR.LookbehindStart) + len + IRL!(IR.LookbehindEnd)
625*627f7eb2Smrg ];
626*627f7eb2Smrg matcher.backrefed = backrefed.empty ? matches : backrefed;
627*627f7eb2Smrg immutable match = (matcher.matchImpl() != 0) ^ (re.ir[pc].code == IR.NeglookbehindStart);
628*627f7eb2Smrg if (!match)
629*627f7eb2Smrg goto L_backtrack;
630*627f7eb2Smrg else
631*627f7eb2Smrg {
632*627f7eb2Smrg pc += IRL!(IR.LookbehindStart)+len+IRL!(IR.LookbehindEnd);
633*627f7eb2Smrg }
634*627f7eb2Smrg break;
635*627f7eb2Smrg case IR.Backref:
636*627f7eb2Smrg immutable n = re.ir[pc].data;
637*627f7eb2Smrg auto referenced = re.ir[pc].localRef
638*627f7eb2Smrg ? s[matches[n].begin .. matches[n].end]
639*627f7eb2Smrg : s[backrefed[n].begin .. backrefed[n].end];
640*627f7eb2Smrg while (!atEnd && !referenced.empty && front == referenced.front)
641*627f7eb2Smrg {
642*627f7eb2Smrg next();
643*627f7eb2Smrg referenced.popFront();
644*627f7eb2Smrg }
645*627f7eb2Smrg if (referenced.empty)
646*627f7eb2Smrg pc++;
647*627f7eb2Smrg else
648*627f7eb2Smrg goto L_backtrack;
649*627f7eb2Smrg break;
650*627f7eb2Smrg case IR.Nop:
651*627f7eb2Smrg pc += IRL!(IR.Nop);
652*627f7eb2Smrg break;
653*627f7eb2Smrg case IR.LookaheadEnd:
654*627f7eb2Smrg case IR.NeglookaheadEnd:
655*627f7eb2Smrg case IR.LookbehindEnd:
656*627f7eb2Smrg case IR.NeglookbehindEnd:
657*627f7eb2Smrg case IR.End:
658*627f7eb2Smrg // cleanup stale stack blocks if any
659*627f7eb2Smrg while (prevStack()) {}
660*627f7eb2Smrg return re.ir[pc].data;
661*627f7eb2Smrg default:
662*627f7eb2Smrg debug printBytecode(re.ir[0..$]);
663*627f7eb2Smrg assert(0);
664*627f7eb2Smrg L_backtrack:
665*627f7eb2Smrg if (!popState())
666*627f7eb2Smrg {
667*627f7eb2Smrg s.reset(start);
668*627f7eb2Smrg return 0;
669*627f7eb2Smrg }
670*627f7eb2Smrg }
671*627f7eb2Smrg }
672*627f7eb2Smrg }
673*627f7eb2Smrg assert(0);
674*627f7eb2Smrg }
675*627f7eb2Smrg
676*627f7eb2Smrg @property size_t stackAvail()
677*627f7eb2Smrg {
678*627f7eb2Smrg return memory.length - lastState;
679*627f7eb2Smrg }
680*627f7eb2Smrg
681*627f7eb2Smrg void stackPush(T)(T val)
682*627f7eb2Smrg if (!isDynamicArray!T)
683*627f7eb2Smrg {
684*627f7eb2Smrg *cast(T*)&memory[lastState] = val;
685*627f7eb2Smrg enum delta = (T.sizeof+size_t.sizeof/2)/size_t.sizeof;
686*627f7eb2Smrg lastState += delta;
687*627f7eb2Smrg debug(std_regex_matcher) writeln("push element SP= ", lastState);
688*627f7eb2Smrg }
689*627f7eb2Smrg
690*627f7eb2Smrg void stackPush(T)(T[] val)
691*627f7eb2Smrg {
692*627f7eb2Smrg static assert(T.sizeof % size_t.sizeof == 0);
693*627f7eb2Smrg (cast(T*)&memory[lastState])[0 .. val.length]
694*627f7eb2Smrg = val[0..$];
695*627f7eb2Smrg lastState += val.length*(T.sizeof/size_t.sizeof);
696*627f7eb2Smrg debug(std_regex_matcher) writeln("push array SP= ", lastState);
697*627f7eb2Smrg }
698*627f7eb2Smrg
699*627f7eb2Smrg void stackPop(T)(ref T val)
700*627f7eb2Smrg if (!isDynamicArray!T)
701*627f7eb2Smrg {
702*627f7eb2Smrg enum delta = (T.sizeof+size_t.sizeof/2)/size_t.sizeof;
703*627f7eb2Smrg lastState -= delta;
704*627f7eb2Smrg val = *cast(T*)&memory[lastState];
705*627f7eb2Smrg debug(std_regex_matcher) writeln("pop element SP= ", lastState);
706*627f7eb2Smrg }
707*627f7eb2Smrg
708*627f7eb2Smrg void stackPop(T)(T[] val)
709*627f7eb2Smrg {
710*627f7eb2Smrg stackPop(val); // call ref version
711*627f7eb2Smrg }
712*627f7eb2Smrg void stackPop(T)(ref T[] val)
713*627f7eb2Smrg {
714*627f7eb2Smrg lastState -= val.length*(T.sizeof/size_t.sizeof);
715*627f7eb2Smrg val[0..$] = (cast(T*)&memory[lastState])[0 .. val.length];
716*627f7eb2Smrg debug(std_regex_matcher) writeln("pop array SP= ", lastState);
717*627f7eb2Smrg }
718*627f7eb2Smrg
719*627f7eb2Smrg static if (!CTregex)
720*627f7eb2Smrg {
721*627f7eb2Smrg //helper function, saves engine state
722*627f7eb2Smrg void pushState(uint pc, uint counter)
723*627f7eb2Smrg {
724*627f7eb2Smrg if (stateSize + 2 * matches.length > stackAvail)
725*627f7eb2Smrg {
726*627f7eb2Smrg newStack();
727*627f7eb2Smrg }
728*627f7eb2Smrg *cast(State*)&memory[lastState] =
729*627f7eb2Smrg State(index, pc, counter, infiniteNesting);
730*627f7eb2Smrg lastState += stateSize;
731*627f7eb2Smrg memory[lastState .. lastState + 2 * matches.length] = (cast(size_t[]) matches)[];
732*627f7eb2Smrg lastState += 2*matches.length;
733*627f7eb2Smrg debug(std_regex_matcher)
734*627f7eb2Smrg writefln("Saved(pc=%s) front: %s src: %s",
735*627f7eb2Smrg pc, front, s[index .. s.lastIndex]);
736*627f7eb2Smrg }
737*627f7eb2Smrg
738*627f7eb2Smrg //helper function, restores engine state
739*627f7eb2Smrg bool popState()
740*627f7eb2Smrg {
741*627f7eb2Smrg if (!lastState && !prevStack())
742*627f7eb2Smrg return false;
743*627f7eb2Smrg lastState -= 2*matches.length;
744*627f7eb2Smrg auto pm = cast(size_t[]) matches;
745*627f7eb2Smrg pm[] = memory[lastState .. lastState + 2 * matches.length];
746*627f7eb2Smrg lastState -= stateSize;
747*627f7eb2Smrg State* state = cast(State*)&memory[lastState];
748*627f7eb2Smrg index = state.index;
749*627f7eb2Smrg pc = state.pc;
750*627f7eb2Smrg counter = state.counter;
751*627f7eb2Smrg infiniteNesting = state.infiniteNesting;
752*627f7eb2Smrg debug(std_regex_matcher)
753*627f7eb2Smrg {
754*627f7eb2Smrg writefln("Restored matches", front, s[index .. s.lastIndex]);
755*627f7eb2Smrg foreach (i, m; matches)
756*627f7eb2Smrg writefln("Sub(%d) : %s..%s", i, m.begin, m.end);
757*627f7eb2Smrg }
758*627f7eb2Smrg s.reset(index);
759*627f7eb2Smrg next();
760*627f7eb2Smrg debug(std_regex_matcher)
761*627f7eb2Smrg writefln("Backtracked (pc=%s) front: %s src: %s",
762*627f7eb2Smrg pc, front, s[index .. s.lastIndex]);
763*627f7eb2Smrg return true;
764*627f7eb2Smrg }
765*627f7eb2Smrg }
766*627f7eb2Smrg }
767*627f7eb2Smrg }
768*627f7eb2Smrg
769*627f7eb2Smrg //very shitty string formatter, $$ replaced with next argument converted to string
770*627f7eb2Smrg @trusted string ctSub( U...)(string format, U args)
771*627f7eb2Smrg {
772*627f7eb2Smrg import std.conv : to;
773*627f7eb2Smrg bool seenDollar;
774*627f7eb2Smrg foreach (i, ch; format)
775*627f7eb2Smrg {
776*627f7eb2Smrg if (ch == '$')
777*627f7eb2Smrg {
778*627f7eb2Smrg if (seenDollar)
779*627f7eb2Smrg {
780*627f7eb2Smrg static if (args.length > 0)
781*627f7eb2Smrg {
782*627f7eb2Smrg return format[0 .. i - 1] ~ to!string(args[0])
783*627f7eb2Smrg ~ ctSub(format[i + 1 .. $], args[1 .. $]);
784*627f7eb2Smrg }
785*627f7eb2Smrg else
786*627f7eb2Smrg assert(0);
787*627f7eb2Smrg }
788*627f7eb2Smrg else
789*627f7eb2Smrg seenDollar = true;
790*627f7eb2Smrg }
791*627f7eb2Smrg else
792*627f7eb2Smrg seenDollar = false;
793*627f7eb2Smrg
794*627f7eb2Smrg }
795*627f7eb2Smrg return format;
796*627f7eb2Smrg }
797*627f7eb2Smrg
798*627f7eb2Smrg alias Sequence(int B, int E) = staticIota!(B, E);
799*627f7eb2Smrg
800*627f7eb2Smrg struct CtContext
801*627f7eb2Smrg {
802*627f7eb2Smrg import std.conv : to, text;
803*627f7eb2Smrg //dirty flags
804*627f7eb2Smrg bool counter;
805*627f7eb2Smrg //to mark the portion of matches to save
806*627f7eb2Smrg int match, total_matches;
807*627f7eb2Smrg int reserved;
808*627f7eb2Smrg CodepointSet[] charsets;
809*627f7eb2Smrg
810*627f7eb2Smrg
811*627f7eb2Smrg //state of codegenerator
812*627f7eb2Smrg static struct CtState
813*627f7eb2Smrg {
814*627f7eb2Smrg string code;
815*627f7eb2Smrg int addr;
816*627f7eb2Smrg }
817*627f7eb2Smrg
818*627f7eb2Smrg this(Char)(Regex!Char re)
819*627f7eb2Smrg {
820*627f7eb2Smrg match = 1;
821*627f7eb2Smrg reserved = 1; //first match is skipped
822*627f7eb2Smrg total_matches = re.ngroup;
823*627f7eb2Smrg charsets = re.charsets;
824*627f7eb2Smrg }
825*627f7eb2Smrg
826*627f7eb2Smrg CtContext lookaround(uint s, uint e)
827*627f7eb2Smrg {
828*627f7eb2Smrg CtContext ct;
829*627f7eb2Smrg ct.total_matches = e - s;
830*627f7eb2Smrg ct.match = 1;
831*627f7eb2Smrg return ct;
832*627f7eb2Smrg }
833*627f7eb2Smrg
834*627f7eb2Smrg //restore state having current context
835*627f7eb2Smrg string restoreCode()
836*627f7eb2Smrg {
837*627f7eb2Smrg string text;
838*627f7eb2Smrg //stack is checked in L_backtrack
839*627f7eb2Smrg text ~= counter
840*627f7eb2Smrg ? "
841*627f7eb2Smrg stackPop(counter);"
842*627f7eb2Smrg : "
843*627f7eb2Smrg counter = 0;";
844*627f7eb2Smrg if (match < total_matches)
845*627f7eb2Smrg {
846*627f7eb2Smrg text ~= ctSub("
847*627f7eb2Smrg stackPop(matches[$$..$$]);", reserved, match);
848*627f7eb2Smrg text ~= ctSub("
849*627f7eb2Smrg matches[$$..$] = typeof(matches[0]).init;", match);
850*627f7eb2Smrg }
851*627f7eb2Smrg else
852*627f7eb2Smrg text ~= ctSub("
853*627f7eb2Smrg stackPop(matches[$$..$]);", reserved);
854*627f7eb2Smrg return text;
855*627f7eb2Smrg }
856*627f7eb2Smrg
857*627f7eb2Smrg //save state having current context
858*627f7eb2Smrg string saveCode(uint pc, string count_expr="counter")
859*627f7eb2Smrg {
860*627f7eb2Smrg string text = ctSub("
861*627f7eb2Smrg if (stackAvail < $$*(Group!(DataIndex)).sizeof/size_t.sizeof + $$)
862*627f7eb2Smrg {
863*627f7eb2Smrg newStack();
864*627f7eb2Smrg }", match - reserved, cast(int) counter + 2);
865*627f7eb2Smrg if (match < total_matches)
866*627f7eb2Smrg text ~= ctSub("
867*627f7eb2Smrg stackPush(matches[$$..$$]);", reserved, match);
868*627f7eb2Smrg else
869*627f7eb2Smrg text ~= ctSub("
870*627f7eb2Smrg stackPush(matches[$$..$]);", reserved);
871*627f7eb2Smrg text ~= counter ? ctSub("
872*627f7eb2Smrg stackPush($$);", count_expr) : "";
873*627f7eb2Smrg text ~= ctSub("
874*627f7eb2Smrg stackPush(index); stackPush($$); \n", pc);
875*627f7eb2Smrg return text;
876*627f7eb2Smrg }
877*627f7eb2Smrg
878*627f7eb2Smrg //
879*627f7eb2Smrg CtState ctGenBlock(Bytecode[] ir, int addr)
880*627f7eb2Smrg {
881*627f7eb2Smrg CtState result;
882*627f7eb2Smrg result.addr = addr;
883*627f7eb2Smrg while (!ir.empty)
884*627f7eb2Smrg {
885*627f7eb2Smrg auto n = ctGenGroup(ir, result.addr);
886*627f7eb2Smrg result.code ~= n.code;
887*627f7eb2Smrg result.addr = n.addr;
888*627f7eb2Smrg }
889*627f7eb2Smrg return result;
890*627f7eb2Smrg }
891*627f7eb2Smrg
892*627f7eb2Smrg //
893*627f7eb2Smrg CtState ctGenGroup(ref Bytecode[] ir, int addr)
894*627f7eb2Smrg {
895*627f7eb2Smrg import std.algorithm.comparison : max;
896*627f7eb2Smrg auto bailOut = "goto L_backtrack;";
897*627f7eb2Smrg auto nextInstr = ctSub("goto case $$;", addr+1);
898*627f7eb2Smrg CtState r;
899*627f7eb2Smrg assert(!ir.empty);
900*627f7eb2Smrg switch (ir[0].code)
901*627f7eb2Smrg {
902*627f7eb2Smrg case IR.InfiniteStart, IR.InfiniteBloomStart,IR.InfiniteQStart, IR.RepeatStart, IR.RepeatQStart:
903*627f7eb2Smrg immutable infLoop =
904*627f7eb2Smrg ir[0].code == IR.InfiniteStart || ir[0].code == IR.InfiniteQStart ||
905*627f7eb2Smrg ir[0].code == IR.InfiniteBloomStart;
906*627f7eb2Smrg
907*627f7eb2Smrg counter = counter ||
908*627f7eb2Smrg ir[0].code == IR.RepeatStart || ir[0].code == IR.RepeatQStart;
909*627f7eb2Smrg immutable len = ir[0].data;
910*627f7eb2Smrg auto nir = ir[ir[0].length .. ir[0].length+len];
911*627f7eb2Smrg r = ctGenBlock(nir, addr+1);
912*627f7eb2Smrg //start/end codegen
913*627f7eb2Smrg //r.addr is at last test+ jump of loop, addr+1 is body of loop
914*627f7eb2Smrg nir = ir[ir[0].length + len .. $];
915*627f7eb2Smrg r.code = ctGenFixupCode(ir[0 .. ir[0].length], addr, r.addr) ~ r.code;
916*627f7eb2Smrg r.code ~= ctGenFixupCode(nir, r.addr, addr+1);
917*627f7eb2Smrg r.addr += 2; //account end instruction + restore state
918*627f7eb2Smrg ir = nir;
919*627f7eb2Smrg break;
920*627f7eb2Smrg case IR.OrStart:
921*627f7eb2Smrg immutable len = ir[0].data;
922*627f7eb2Smrg auto nir = ir[ir[0].length .. ir[0].length+len];
923*627f7eb2Smrg r = ctGenAlternation(nir, addr);
924*627f7eb2Smrg ir = ir[ir[0].length + len .. $];
925*627f7eb2Smrg assert(ir[0].code == IR.OrEnd);
926*627f7eb2Smrg ir = ir[ir[0].length..$];
927*627f7eb2Smrg break;
928*627f7eb2Smrg case IR.LookaheadStart:
929*627f7eb2Smrg case IR.NeglookaheadStart:
930*627f7eb2Smrg case IR.LookbehindStart:
931*627f7eb2Smrg case IR.NeglookbehindStart:
932*627f7eb2Smrg immutable len = ir[0].data;
933*627f7eb2Smrg immutable behind = ir[0].code == IR.LookbehindStart || ir[0].code == IR.NeglookbehindStart;
934*627f7eb2Smrg immutable negative = ir[0].code == IR.NeglookaheadStart || ir[0].code == IR.NeglookbehindStart;
935*627f7eb2Smrg string fwdType = "typeof(fwdMatcher(matcher, []))";
936*627f7eb2Smrg string bwdType = "typeof(bwdMatcher(matcher, []))";
937*627f7eb2Smrg string fwdCreate = "fwdMatcher(matcher, mem)";
938*627f7eb2Smrg string bwdCreate = "bwdMatcher(matcher, mem)";
939*627f7eb2Smrg immutable start = IRL!(IR.LookbehindStart);
940*627f7eb2Smrg immutable end = IRL!(IR.LookbehindStart)+len+IRL!(IR.LookaheadEnd);
941*627f7eb2Smrg CtContext context = lookaround(ir[1].raw, ir[2].raw); //split off new context
942*627f7eb2Smrg auto slice = ir[start .. end];
943*627f7eb2Smrg r.code ~= ctSub(`
944*627f7eb2Smrg case $$: //fake lookaround "atom"
945*627f7eb2Smrg static if (typeof(matcher.s).isLoopback)
946*627f7eb2Smrg alias Lookaround = $$;
947*627f7eb2Smrg else
948*627f7eb2Smrg alias Lookaround = $$;
949*627f7eb2Smrg static bool matcher_$$(ref Lookaround matcher) @trusted
950*627f7eb2Smrg {
951*627f7eb2Smrg //(neg)lookaround piece start
952*627f7eb2Smrg $$
953*627f7eb2Smrg //(neg)lookaround piece ends
954*627f7eb2Smrg }
955*627f7eb2Smrg auto save = index;
956*627f7eb2Smrg auto mem = malloc(initialMemory(re))[0 .. initialMemory(re)];
957*627f7eb2Smrg scope(exit) free(mem.ptr);
958*627f7eb2Smrg static if (typeof(matcher.s).isLoopback)
959*627f7eb2Smrg auto lookaround = $$;
960*627f7eb2Smrg else
961*627f7eb2Smrg auto lookaround = $$;
962*627f7eb2Smrg lookaround.matches = matches[$$..$$];
963*627f7eb2Smrg lookaround.backrefed = backrefed.empty ? matches : backrefed;
964*627f7eb2Smrg lookaround.nativeFn = &matcher_$$; //hookup closure's binary code
965*627f7eb2Smrg int match = $$;
966*627f7eb2Smrg s.reset(save);
967*627f7eb2Smrg next();
968*627f7eb2Smrg if (match)
969*627f7eb2Smrg $$
970*627f7eb2Smrg else
971*627f7eb2Smrg $$`, addr,
972*627f7eb2Smrg behind ? fwdType : bwdType, behind ? bwdType : fwdType,
973*627f7eb2Smrg addr, context.ctGenRegEx(slice),
974*627f7eb2Smrg behind ? fwdCreate : bwdCreate, behind ? bwdCreate : fwdCreate,
975*627f7eb2Smrg ir[1].raw, ir[2].raw, //start - end of matches slice
976*627f7eb2Smrg addr,
977*627f7eb2Smrg negative ? "!lookaround.matchImpl()" : "lookaround.matchImpl()",
978*627f7eb2Smrg nextInstr, bailOut);
979*627f7eb2Smrg ir = ir[end .. $];
980*627f7eb2Smrg r.addr = addr + 1;
981*627f7eb2Smrg break;
982*627f7eb2Smrg case IR.LookaheadEnd: case IR.NeglookaheadEnd:
983*627f7eb2Smrg case IR.LookbehindEnd: case IR.NeglookbehindEnd:
984*627f7eb2Smrg ir = ir[IRL!(IR.LookaheadEnd) .. $];
985*627f7eb2Smrg r.addr = addr;
986*627f7eb2Smrg break;
987*627f7eb2Smrg default:
988*627f7eb2Smrg assert(ir[0].isAtom, text(ir[0].mnemonic));
989*627f7eb2Smrg r = ctGenAtom(ir, addr);
990*627f7eb2Smrg }
991*627f7eb2Smrg return r;
992*627f7eb2Smrg }
993*627f7eb2Smrg
994*627f7eb2Smrg //generate source for bytecode contained in OrStart ... OrEnd
995*627f7eb2Smrg CtState ctGenAlternation(Bytecode[] ir, int addr)
996*627f7eb2Smrg {
997*627f7eb2Smrg CtState[] pieces;
998*627f7eb2Smrg CtState r;
999*627f7eb2Smrg enum optL = IRL!(IR.Option);
1000*627f7eb2Smrg for (;;)
1001*627f7eb2Smrg {
1002*627f7eb2Smrg assert(ir[0].code == IR.Option);
1003*627f7eb2Smrg auto len = ir[0].data;
1004*627f7eb2Smrg if (optL+len < ir.length && ir[optL+len].code == IR.Option)//not a last option
1005*627f7eb2Smrg {
1006*627f7eb2Smrg auto nir = ir[optL .. optL+len-IRL!(IR.GotoEndOr)];
1007*627f7eb2Smrg r = ctGenBlock(nir, addr+2);//space for Option + restore state
1008*627f7eb2Smrg //r.addr+1 to account GotoEndOr at end of branch
1009*627f7eb2Smrg r.code = ctGenFixupCode(ir[0 .. ir[0].length], addr, r.addr+1) ~ r.code;
1010*627f7eb2Smrg addr = r.addr+1;//leave space for GotoEndOr
1011*627f7eb2Smrg pieces ~= r;
1012*627f7eb2Smrg ir = ir[optL + len .. $];
1013*627f7eb2Smrg }
1014*627f7eb2Smrg else
1015*627f7eb2Smrg {
1016*627f7eb2Smrg pieces ~= ctGenBlock(ir[optL..$], addr);
1017*627f7eb2Smrg addr = pieces[$-1].addr;
1018*627f7eb2Smrg break;
1019*627f7eb2Smrg }
1020*627f7eb2Smrg }
1021*627f7eb2Smrg r = pieces[0];
1022*627f7eb2Smrg for (uint i = 1; i < pieces.length; i++)
1023*627f7eb2Smrg {
1024*627f7eb2Smrg r.code ~= ctSub(`
1025*627f7eb2Smrg case $$:
1026*627f7eb2Smrg goto case $$; `, pieces[i-1].addr, addr);
1027*627f7eb2Smrg r.code ~= pieces[i].code;
1028*627f7eb2Smrg }
1029*627f7eb2Smrg r.addr = addr;
1030*627f7eb2Smrg return r;
1031*627f7eb2Smrg }
1032*627f7eb2Smrg
1033*627f7eb2Smrg // generate fixup code for instruction in ir,
1034*627f7eb2Smrg // fixup means it has an alternative way for control flow
1035*627f7eb2Smrg string ctGenFixupCode(Bytecode[] ir, int addr, int fixup)
1036*627f7eb2Smrg {
1037*627f7eb2Smrg return ctGenFixupCode(ir, addr, fixup); // call ref Bytecode[] version
1038*627f7eb2Smrg }
1039*627f7eb2Smrg string ctGenFixupCode(ref Bytecode[] ir, int addr, int fixup)
1040*627f7eb2Smrg {
1041*627f7eb2Smrg string r;
1042*627f7eb2Smrg string testCode;
1043*627f7eb2Smrg r = ctSub(`
1044*627f7eb2Smrg case $$: debug(std_regex_matcher) writeln("#$$");`,
1045*627f7eb2Smrg addr, addr);
1046*627f7eb2Smrg switch (ir[0].code)
1047*627f7eb2Smrg {
1048*627f7eb2Smrg case IR.InfiniteStart, IR.InfiniteQStart, IR.InfiniteBloomStart:
1049*627f7eb2Smrg r ~= ctSub( `
1050*627f7eb2Smrg goto case $$;`, fixup);
1051*627f7eb2Smrg ir = ir[ir[0].length..$];
1052*627f7eb2Smrg break;
1053*627f7eb2Smrg case IR.InfiniteEnd:
1054*627f7eb2Smrg testCode = ctQuickTest(ir[IRL!(IR.InfiniteEnd) .. $],addr + 1);
1055*627f7eb2Smrg r ~= ctSub( `
1056*627f7eb2Smrg if (merge[$$+counter].mark(index))
1057*627f7eb2Smrg {
1058*627f7eb2Smrg // merged!
1059*627f7eb2Smrg goto L_backtrack;
1060*627f7eb2Smrg }
1061*627f7eb2Smrg
1062*627f7eb2Smrg $$
1063*627f7eb2Smrg {
1064*627f7eb2Smrg $$
1065*627f7eb2Smrg }
1066*627f7eb2Smrg goto case $$;
1067*627f7eb2Smrg case $$: //restore state and go out of loop
1068*627f7eb2Smrg $$
1069*627f7eb2Smrg goto case;`, ir[1].raw, testCode, saveCode(addr+1), fixup,
1070*627f7eb2Smrg addr+1, restoreCode());
1071*627f7eb2Smrg ir = ir[ir[0].length..$];
1072*627f7eb2Smrg break;
1073*627f7eb2Smrg case IR.InfiniteBloomEnd:
1074*627f7eb2Smrg //TODO: check bloom filter and skip on failure
1075*627f7eb2Smrg testCode = ctQuickTest(ir[IRL!(IR.InfiniteBloomEnd) .. $],addr + 1);
1076*627f7eb2Smrg r ~= ctSub( `
1077*627f7eb2Smrg if (merge[$$+counter].mark(index))
1078*627f7eb2Smrg {
1079*627f7eb2Smrg // merged!
1080*627f7eb2Smrg goto L_backtrack;
1081*627f7eb2Smrg }
1082*627f7eb2Smrg
1083*627f7eb2Smrg $$
1084*627f7eb2Smrg {
1085*627f7eb2Smrg $$
1086*627f7eb2Smrg }
1087*627f7eb2Smrg goto case $$;
1088*627f7eb2Smrg case $$: //restore state and go out of loop
1089*627f7eb2Smrg $$
1090*627f7eb2Smrg goto case;`, ir[1].raw, testCode, saveCode(addr+1), fixup,
1091*627f7eb2Smrg addr+1, restoreCode());
1092*627f7eb2Smrg ir = ir[ir[0].length..$];
1093*627f7eb2Smrg break;
1094*627f7eb2Smrg case IR.InfiniteQEnd:
1095*627f7eb2Smrg testCode = ctQuickTest(ir[IRL!(IR.InfiniteEnd) .. $],addr + 1);
1096*627f7eb2Smrg auto altCode = testCode.length ? ctSub("else goto case $$;", fixup) : "";
1097*627f7eb2Smrg r ~= ctSub( `
1098*627f7eb2Smrg if (merge[$$+counter].mark(index))
1099*627f7eb2Smrg {
1100*627f7eb2Smrg // merged!
1101*627f7eb2Smrg goto L_backtrack;
1102*627f7eb2Smrg }
1103*627f7eb2Smrg
1104*627f7eb2Smrg $$
1105*627f7eb2Smrg {
1106*627f7eb2Smrg $$
1107*627f7eb2Smrg goto case $$;
1108*627f7eb2Smrg }
1109*627f7eb2Smrg $$
1110*627f7eb2Smrg case $$://restore state and go inside loop
1111*627f7eb2Smrg $$
1112*627f7eb2Smrg goto case $$;`, ir[1].raw,
1113*627f7eb2Smrg testCode, saveCode(addr+1), addr+2, altCode,
1114*627f7eb2Smrg addr+1, restoreCode(), fixup);
1115*627f7eb2Smrg ir = ir[ir[0].length..$];
1116*627f7eb2Smrg break;
1117*627f7eb2Smrg case IR.RepeatStart, IR.RepeatQStart:
1118*627f7eb2Smrg r ~= ctSub( `
1119*627f7eb2Smrg goto case $$;`, fixup);
1120*627f7eb2Smrg ir = ir[ir[0].length..$];
1121*627f7eb2Smrg break;
1122*627f7eb2Smrg case IR.RepeatEnd, IR.RepeatQEnd:
1123*627f7eb2Smrg //len, step, min, max
1124*627f7eb2Smrg immutable len = ir[0].data;
1125*627f7eb2Smrg immutable step = ir[2].raw;
1126*627f7eb2Smrg immutable min = ir[3].raw;
1127*627f7eb2Smrg immutable max = ir[4].raw;
1128*627f7eb2Smrg r ~= ctSub(`
1129*627f7eb2Smrg if (merge[$$+counter].mark(index))
1130*627f7eb2Smrg {
1131*627f7eb2Smrg // merged!
1132*627f7eb2Smrg goto L_backtrack;
1133*627f7eb2Smrg }
1134*627f7eb2Smrg if (counter < $$)
1135*627f7eb2Smrg {
1136*627f7eb2Smrg debug(std_regex_matcher) writeln("RepeatEnd min case pc=", $$);
1137*627f7eb2Smrg counter += $$;
1138*627f7eb2Smrg goto case $$;
1139*627f7eb2Smrg }`, ir[1].raw, min, addr, step, fixup);
1140*627f7eb2Smrg if (ir[0].code == IR.RepeatEnd)
1141*627f7eb2Smrg {
1142*627f7eb2Smrg string counter_expr = ctSub("counter % $$", step);
1143*627f7eb2Smrg r ~= ctSub(`
1144*627f7eb2Smrg else if (counter < $$)
1145*627f7eb2Smrg {
1146*627f7eb2Smrg $$
1147*627f7eb2Smrg counter += $$;
1148*627f7eb2Smrg goto case $$;
1149*627f7eb2Smrg }`, max, saveCode(addr+1, counter_expr), step, fixup);
1150*627f7eb2Smrg }
1151*627f7eb2Smrg else
1152*627f7eb2Smrg {
1153*627f7eb2Smrg string counter_expr = ctSub("counter % $$", step);
1154*627f7eb2Smrg r ~= ctSub(`
1155*627f7eb2Smrg else if (counter < $$)
1156*627f7eb2Smrg {
1157*627f7eb2Smrg $$
1158*627f7eb2Smrg counter = counter % $$;
1159*627f7eb2Smrg goto case $$;
1160*627f7eb2Smrg }`, max, saveCode(addr+1,counter_expr), step, addr+2);
1161*627f7eb2Smrg }
1162*627f7eb2Smrg r ~= ctSub(`
1163*627f7eb2Smrg else
1164*627f7eb2Smrg {
1165*627f7eb2Smrg counter = counter % $$;
1166*627f7eb2Smrg goto case $$;
1167*627f7eb2Smrg }
1168*627f7eb2Smrg case $$: //restore state
1169*627f7eb2Smrg $$
1170*627f7eb2Smrg goto case $$;`, step, addr+2, addr+1, restoreCode(),
1171*627f7eb2Smrg ir[0].code == IR.RepeatEnd ? addr+2 : fixup );
1172*627f7eb2Smrg ir = ir[ir[0].length..$];
1173*627f7eb2Smrg break;
1174*627f7eb2Smrg case IR.Option:
1175*627f7eb2Smrg r ~= ctSub( `
1176*627f7eb2Smrg {
1177*627f7eb2Smrg $$
1178*627f7eb2Smrg }
1179*627f7eb2Smrg goto case $$;
1180*627f7eb2Smrg case $$://restore thunk to go to the next group
1181*627f7eb2Smrg $$
1182*627f7eb2Smrg goto case $$;`, saveCode(addr+1), addr+2,
1183*627f7eb2Smrg addr+1, restoreCode(), fixup);
1184*627f7eb2Smrg ir = ir[ir[0].length..$];
1185*627f7eb2Smrg break;
1186*627f7eb2Smrg default:
1187*627f7eb2Smrg assert(0, text(ir[0].mnemonic));
1188*627f7eb2Smrg }
1189*627f7eb2Smrg return r;
1190*627f7eb2Smrg }
1191*627f7eb2Smrg
1192*627f7eb2Smrg
1193*627f7eb2Smrg string ctQuickTest(Bytecode[] ir, int id)
1194*627f7eb2Smrg {
1195*627f7eb2Smrg uint pc = 0;
1196*627f7eb2Smrg while (pc < ir.length && ir[pc].isAtom)
1197*627f7eb2Smrg {
1198*627f7eb2Smrg if (ir[pc].code == IR.GroupStart || ir[pc].code == IR.GroupEnd)
1199*627f7eb2Smrg {
1200*627f7eb2Smrg pc++;
1201*627f7eb2Smrg }
1202*627f7eb2Smrg else if (ir[pc].code == IR.Backref)
1203*627f7eb2Smrg break;
1204*627f7eb2Smrg else
1205*627f7eb2Smrg {
1206*627f7eb2Smrg auto code = ctAtomCode(ir[pc..$], -1);
1207*627f7eb2Smrg return ctSub(`
1208*627f7eb2Smrg int test_$$()
1209*627f7eb2Smrg {
1210*627f7eb2Smrg $$ //$$
1211*627f7eb2Smrg }
1212*627f7eb2Smrg if (test_$$() >= 0)`, id, code.ptr ? code : "return 0;",
1213*627f7eb2Smrg ir[pc].mnemonic, id);
1214*627f7eb2Smrg }
1215*627f7eb2Smrg }
1216*627f7eb2Smrg return "";
1217*627f7eb2Smrg }
1218*627f7eb2Smrg
1219*627f7eb2Smrg //process & generate source for simple bytecodes at front of ir using address addr
1220*627f7eb2Smrg CtState ctGenAtom(ref Bytecode[] ir, int addr)
1221*627f7eb2Smrg {
1222*627f7eb2Smrg CtState result;
1223*627f7eb2Smrg result.code = ctAtomCode(ir, addr);
1224*627f7eb2Smrg ir.popFrontN(ir[0].code == IR.OrChar ? ir[0].sequence : ir[0].length);
1225*627f7eb2Smrg result.addr = addr + 1;
1226*627f7eb2Smrg return result;
1227*627f7eb2Smrg }
1228*627f7eb2Smrg
1229*627f7eb2Smrg //D code for atom at ir using address addr, addr < 0 means quickTest
1230*627f7eb2Smrg string ctAtomCode(Bytecode[] ir, int addr)
1231*627f7eb2Smrg {
1232*627f7eb2Smrg string code;
1233*627f7eb2Smrg string bailOut, nextInstr;
1234*627f7eb2Smrg if (addr < 0)
1235*627f7eb2Smrg {
1236*627f7eb2Smrg bailOut = "return -1;";
1237*627f7eb2Smrg nextInstr = "return 0;";
1238*627f7eb2Smrg }
1239*627f7eb2Smrg else
1240*627f7eb2Smrg {
1241*627f7eb2Smrg bailOut = "goto L_backtrack;";
1242*627f7eb2Smrg nextInstr = ctSub("goto case $$;", addr+1);
1243*627f7eb2Smrg code ~= ctSub( `
1244*627f7eb2Smrg case $$: debug(std_regex_matcher) writeln("#$$");
1245*627f7eb2Smrg `, addr, addr);
1246*627f7eb2Smrg }
1247*627f7eb2Smrg switch (ir[0].code)
1248*627f7eb2Smrg {
1249*627f7eb2Smrg case IR.OrChar://assumes IRL!(OrChar) == 1
1250*627f7eb2Smrg code ~= ctSub(`
1251*627f7eb2Smrg if (atEnd)
1252*627f7eb2Smrg $$`, bailOut);
1253*627f7eb2Smrg immutable len = ir[0].sequence;
1254*627f7eb2Smrg for (uint i = 0; i < len; i++)
1255*627f7eb2Smrg {
1256*627f7eb2Smrg code ~= ctSub( `
1257*627f7eb2Smrg if (front == $$)
1258*627f7eb2Smrg {
1259*627f7eb2Smrg $$
1260*627f7eb2Smrg $$
1261*627f7eb2Smrg }`, ir[i].data, addr >= 0 ? "next();" :"", nextInstr);
1262*627f7eb2Smrg }
1263*627f7eb2Smrg code ~= ctSub( `
1264*627f7eb2Smrg $$`, bailOut);
1265*627f7eb2Smrg break;
1266*627f7eb2Smrg case IR.Char:
1267*627f7eb2Smrg code ~= ctSub( `
1268*627f7eb2Smrg if (atEnd || front != $$)
1269*627f7eb2Smrg $$
1270*627f7eb2Smrg $$
1271*627f7eb2Smrg $$`, ir[0].data, bailOut, addr >= 0 ? "next();" :"", nextInstr);
1272*627f7eb2Smrg break;
1273*627f7eb2Smrg case IR.Any:
1274*627f7eb2Smrg code ~= ctSub( `
1275*627f7eb2Smrg if (atEnd || (!(re.flags & RegexOption.singleline)
1276*627f7eb2Smrg && (front == '\r' || front == '\n')))
1277*627f7eb2Smrg $$
1278*627f7eb2Smrg $$
1279*627f7eb2Smrg $$`, bailOut, addr >= 0 ? "next();" :"",nextInstr);
1280*627f7eb2Smrg break;
1281*627f7eb2Smrg case IR.CodepointSet:
1282*627f7eb2Smrg if (charsets.length)
1283*627f7eb2Smrg {
1284*627f7eb2Smrg string name = `func_`~to!string(addr+1);
1285*627f7eb2Smrg string funcCode = charsets[ir[0].data].toSourceCode(name);
1286*627f7eb2Smrg code ~= ctSub( `
1287*627f7eb2Smrg static $$
1288*627f7eb2Smrg if (atEnd || !$$(front))
1289*627f7eb2Smrg $$
1290*627f7eb2Smrg $$
1291*627f7eb2Smrg $$`, funcCode, name, bailOut, addr >= 0 ? "next();" :"", nextInstr);
1292*627f7eb2Smrg }
1293*627f7eb2Smrg else
1294*627f7eb2Smrg code ~= ctSub( `
1295*627f7eb2Smrg if (atEnd || !re.charsets[$$].scanFor(front))
1296*627f7eb2Smrg $$
1297*627f7eb2Smrg $$
1298*627f7eb2Smrg $$`, ir[0].data, bailOut, addr >= 0 ? "next();" :"", nextInstr);
1299*627f7eb2Smrg break;
1300*627f7eb2Smrg case IR.Trie:
1301*627f7eb2Smrg if (charsets.length && charsets[ir[0].data].byInterval.length <= 8)
1302*627f7eb2Smrg goto case IR.CodepointSet;
1303*627f7eb2Smrg code ~= ctSub( `
1304*627f7eb2Smrg if (atEnd || !re.matchers[$$][front])
1305*627f7eb2Smrg $$
1306*627f7eb2Smrg $$
1307*627f7eb2Smrg $$`, ir[0].data, bailOut, addr >= 0 ? "next();" :"", nextInstr);
1308*627f7eb2Smrg break;
1309*627f7eb2Smrg case IR.Wordboundary:
1310*627f7eb2Smrg code ~= ctSub( `
1311*627f7eb2Smrg dchar back;
1312*627f7eb2Smrg DataIndex bi;
1313*627f7eb2Smrg if (atStart && wordMatcher[front])
1314*627f7eb2Smrg {
1315*627f7eb2Smrg $$
1316*627f7eb2Smrg }
1317*627f7eb2Smrg else if (atEnd && s.loopBack(index).nextChar(back, bi)
1318*627f7eb2Smrg && wordMatcher[back])
1319*627f7eb2Smrg {
1320*627f7eb2Smrg $$
1321*627f7eb2Smrg }
1322*627f7eb2Smrg else if (s.loopBack(index).nextChar(back, bi))
1323*627f7eb2Smrg {
1324*627f7eb2Smrg bool af = wordMatcher[front];
1325*627f7eb2Smrg bool ab = wordMatcher[back];
1326*627f7eb2Smrg if (af ^ ab)
1327*627f7eb2Smrg {
1328*627f7eb2Smrg $$
1329*627f7eb2Smrg }
1330*627f7eb2Smrg }
1331*627f7eb2Smrg $$`, nextInstr, nextInstr, nextInstr, bailOut);
1332*627f7eb2Smrg break;
1333*627f7eb2Smrg case IR.Notwordboundary:
1334*627f7eb2Smrg code ~= ctSub( `
1335*627f7eb2Smrg dchar back;
1336*627f7eb2Smrg DataIndex bi;
1337*627f7eb2Smrg //at start & end of input
1338*627f7eb2Smrg if (atStart && wordMatcher[front])
1339*627f7eb2Smrg $$
1340*627f7eb2Smrg else if (atEnd && s.loopBack(index).nextChar(back, bi)
1341*627f7eb2Smrg && wordMatcher[back])
1342*627f7eb2Smrg $$
1343*627f7eb2Smrg else if (s.loopBack(index).nextChar(back, bi))
1344*627f7eb2Smrg {
1345*627f7eb2Smrg bool af = wordMatcher[front];
1346*627f7eb2Smrg bool ab = wordMatcher[back];
1347*627f7eb2Smrg if (af ^ ab)
1348*627f7eb2Smrg $$
1349*627f7eb2Smrg }
1350*627f7eb2Smrg $$`, bailOut, bailOut, bailOut, nextInstr);
1351*627f7eb2Smrg
1352*627f7eb2Smrg break;
1353*627f7eb2Smrg case IR.Bol:
1354*627f7eb2Smrg code ~= ctSub(`
1355*627f7eb2Smrg dchar back;
1356*627f7eb2Smrg DataIndex bi;
1357*627f7eb2Smrg if (atStart || (s.loopBack(index).nextChar(back,bi)
1358*627f7eb2Smrg && endOfLine(back, front == '\n')))
1359*627f7eb2Smrg {
1360*627f7eb2Smrg debug(std_regex_matcher) writeln("BOL matched");
1361*627f7eb2Smrg $$
1362*627f7eb2Smrg }
1363*627f7eb2Smrg else
1364*627f7eb2Smrg $$`, nextInstr, bailOut);
1365*627f7eb2Smrg
1366*627f7eb2Smrg break;
1367*627f7eb2Smrg case IR.Bof:
1368*627f7eb2Smrg code ~= ctSub(`
1369*627f7eb2Smrg if (atStart)
1370*627f7eb2Smrg {
1371*627f7eb2Smrg debug(std_regex_matcher) writeln("BOF matched");
1372*627f7eb2Smrg $$
1373*627f7eb2Smrg }
1374*627f7eb2Smrg else
1375*627f7eb2Smrg $$`, nextInstr, bailOut);
1376*627f7eb2Smrg break;
1377*627f7eb2Smrg case IR.Eol:
1378*627f7eb2Smrg code ~= ctSub(`
1379*627f7eb2Smrg dchar back;
1380*627f7eb2Smrg DataIndex bi;
1381*627f7eb2Smrg debug(std_regex_matcher) writefln("EOL (front 0x%x) %s", front, s[index .. s.lastIndex]);
1382*627f7eb2Smrg //no matching inside \r\n
1383*627f7eb2Smrg if (atEnd || (endOfLine(front, s.loopBack(index).nextChar(back,bi)
1384*627f7eb2Smrg && back == '\r')))
1385*627f7eb2Smrg {
1386*627f7eb2Smrg debug(std_regex_matcher) writeln("EOL matched");
1387*627f7eb2Smrg $$
1388*627f7eb2Smrg }
1389*627f7eb2Smrg else
1390*627f7eb2Smrg $$`, nextInstr, bailOut);
1391*627f7eb2Smrg break;
1392*627f7eb2Smrg case IR.Eof:
1393*627f7eb2Smrg code ~= ctSub(`
1394*627f7eb2Smrg if (atEnd)
1395*627f7eb2Smrg {
1396*627f7eb2Smrg debug(std_regex_matcher) writeln("BOF matched");
1397*627f7eb2Smrg $$
1398*627f7eb2Smrg }
1399*627f7eb2Smrg else
1400*627f7eb2Smrg $$`, nextInstr, bailOut);
1401*627f7eb2Smrg break;
1402*627f7eb2Smrg case IR.GroupStart:
1403*627f7eb2Smrg code ~= ctSub(`
1404*627f7eb2Smrg matches[$$].begin = index;
1405*627f7eb2Smrg $$`, ir[0].data, nextInstr);
1406*627f7eb2Smrg match = ir[0].data+1;
1407*627f7eb2Smrg break;
1408*627f7eb2Smrg case IR.GroupEnd:
1409*627f7eb2Smrg code ~= ctSub(`
1410*627f7eb2Smrg matches[$$].end = index;
1411*627f7eb2Smrg $$`, ir[0].data, nextInstr);
1412*627f7eb2Smrg break;
1413*627f7eb2Smrg case IR.Backref:
1414*627f7eb2Smrg string mStr = "auto referenced = ";
1415*627f7eb2Smrg mStr ~= ir[0].localRef
1416*627f7eb2Smrg ? ctSub("s[matches[$$].begin .. matches[$$].end];",
1417*627f7eb2Smrg ir[0].data, ir[0].data)
1418*627f7eb2Smrg : ctSub("s[backrefed[$$].begin .. backrefed[$$].end];",
1419*627f7eb2Smrg ir[0].data, ir[0].data);
1420*627f7eb2Smrg code ~= ctSub( `
1421*627f7eb2Smrg $$
1422*627f7eb2Smrg while (!atEnd && !referenced.empty && front == referenced.front)
1423*627f7eb2Smrg {
1424*627f7eb2Smrg next();
1425*627f7eb2Smrg referenced.popFront();
1426*627f7eb2Smrg }
1427*627f7eb2Smrg if (referenced.empty)
1428*627f7eb2Smrg $$
1429*627f7eb2Smrg else
1430*627f7eb2Smrg $$`, mStr, nextInstr, bailOut);
1431*627f7eb2Smrg break;
1432*627f7eb2Smrg case IR.Nop:
1433*627f7eb2Smrg case IR.End:
1434*627f7eb2Smrg break;
1435*627f7eb2Smrg default:
1436*627f7eb2Smrg assert(0, text(ir[0].mnemonic, " is not supported yet"));
1437*627f7eb2Smrg }
1438*627f7eb2Smrg return code;
1439*627f7eb2Smrg }
1440*627f7eb2Smrg
1441*627f7eb2Smrg //generate D code for the whole regex
1442*627f7eb2Smrg public string ctGenRegEx(Bytecode[] ir)
1443*627f7eb2Smrg {
1444*627f7eb2Smrg auto bdy = ctGenBlock(ir, 0);
1445*627f7eb2Smrg auto r = `
1446*627f7eb2Smrg import core.stdc.stdlib;
1447*627f7eb2Smrg with(matcher)
1448*627f7eb2Smrg {
1449*627f7eb2Smrg pc = 0;
1450*627f7eb2Smrg counter = 0;
1451*627f7eb2Smrg lastState = 0;
1452*627f7eb2Smrg matches[] = Group!DataIndex.init;
1453*627f7eb2Smrg auto start = s._index;`;
1454*627f7eb2Smrg r ~= `
1455*627f7eb2Smrg goto StartLoop;
1456*627f7eb2Smrg debug(std_regex_matcher) writeln("Try CT matching starting at ",s[index .. s.lastIndex]);
1457*627f7eb2Smrg L_backtrack:
1458*627f7eb2Smrg if (lastState || prevStack())
1459*627f7eb2Smrg {
1460*627f7eb2Smrg stackPop(pc);
1461*627f7eb2Smrg stackPop(index);
1462*627f7eb2Smrg s.reset(index);
1463*627f7eb2Smrg next();
1464*627f7eb2Smrg }
1465*627f7eb2Smrg else
1466*627f7eb2Smrg {
1467*627f7eb2Smrg s.reset(start);
1468*627f7eb2Smrg return false;
1469*627f7eb2Smrg }
1470*627f7eb2Smrg StartLoop:
1471*627f7eb2Smrg switch (pc)
1472*627f7eb2Smrg {
1473*627f7eb2Smrg `;
1474*627f7eb2Smrg r ~= bdy.code;
1475*627f7eb2Smrg r ~= ctSub(`
1476*627f7eb2Smrg case $$: break;`,bdy.addr);
1477*627f7eb2Smrg r ~= `
1478*627f7eb2Smrg default:
1479*627f7eb2Smrg assert(0);
1480*627f7eb2Smrg }
1481*627f7eb2Smrg // cleanup stale stack blocks
1482*627f7eb2Smrg while (prevStack()) {}
1483*627f7eb2Smrg return true;
1484*627f7eb2Smrg }
1485*627f7eb2Smrg `;
1486*627f7eb2Smrg return r;
1487*627f7eb2Smrg }
1488*627f7eb2Smrg
1489*627f7eb2Smrg }
1490*627f7eb2Smrg
1491*627f7eb2Smrg string ctGenRegExCode(Char)(Regex!Char re)
1492*627f7eb2Smrg {
1493*627f7eb2Smrg auto context = CtContext(re);
1494*627f7eb2Smrg return context.ctGenRegEx(re.ir);
1495*627f7eb2Smrg }
1496