1 /*
2 Implementation of backtracking std.regex engine.
3 Contains both compile-time and run-time versions.
4 */
5 module std.regex.internal.backtracking;
6
7 package(std.regex):
8
9 import core.stdc.stdlib, std.range.primitives, std.traits, std.typecons;
10 import std.regex.internal.ir;
11
12 /+
13 BacktrackingMatcher implements backtracking scheme of matching
14 regular expressions.
15 +/
BacktrackingMatcher(bool CTregex)16 template BacktrackingMatcher(bool CTregex)
17 {
18 @trusted struct BacktrackingMatcher(Char, Stream = Input!Char)
19 if (is(Char : dchar))
20 {
21 alias DataIndex = Stream.DataIndex;
22 struct State
23 {//top bit in pc is set if saved along with matches
24 DataIndex index;
25 uint pc, counter, infiniteNesting;
26 }
27 static assert(State.sizeof % size_t.sizeof == 0);
28 enum stateSize = State.sizeof / size_t.sizeof;
29 enum initialStack = 1 << 11; // items in a block of segmented stack
30 alias String = const(Char)[];
31 alias RegEx = Regex!Char;
32 alias MatchFn = bool function (ref BacktrackingMatcher!(Char, Stream));
33 RegEx re; //regex program
34 static if (CTregex)
35 MatchFn nativeFn; //native code for that program
36 //Stream state
37 Stream s;
38 DataIndex index;
39 dchar front;
40 bool exhausted;
41 //backtracking machine state
42 uint pc, counter;
43 DataIndex lastState = 0; //top of state stack
44 static if (!CTregex)
45 uint infiniteNesting;
46 size_t[] memory;
47 Trace[] merge;
48 static struct Trace
49 {
50 ulong mask;
51 size_t offset;
52
53 bool mark(size_t idx)
54 {
55 immutable d = idx - offset;
56 if (d < 64) // including overflow
57 {
58 immutable p = mask & (1UL << d);
59 mask |= 1UL << d;
60 return p != 0;
61 }
62 else
63 {
64 offset = idx;
65 mask = 1;
66 return false;
67 }
68 }
69 }
70 //local slice of matches, global for backref
71 Group!DataIndex[] matches, backrefed;
72
73 static if (__traits(hasMember,Stream, "search"))
74 {
75 enum kicked = true;
76 }
77 else
78 enum kicked = false;
79
80 static size_t initialMemory(const ref RegEx re)
81 {
82 return stackSize(re)*size_t.sizeof + re.hotspotTableSize*Trace.sizeof;
83 }
84
85 static size_t stackSize(const ref RegEx re)
86 {
87 size_t itemSize = stateSize
88 + re.ngroup * (Group!DataIndex).sizeof / size_t.sizeof;
89 return initialStack * itemSize + 2;
90 }
91
92 @property bool atStart(){ return index == 0; }
93
94 @property bool atEnd(){ return index == s.lastIndex && s.atEnd; }
95
96 void next()
97 {
98 if (!s.nextChar(front, index))
99 index = s.lastIndex;
100 }
101
102 void search()
103 {
104 static if (kicked)
105 {
106 if (!s.search(re.kickstart, front, index))
107 {
108 index = s.lastIndex;
109 }
110 }
111 else
112 next();
113 }
114
115 //
116 void newStack()
117 {
118 auto chunk = mallocArray!(size_t)(stackSize(re));
119 chunk[0] = cast(size_t)(memory.ptr);
120 chunk[1] = lastState;
121 memory = chunk[2..$];
122 lastState = 0;
123 }
124
125 bool prevStack()
126 {
127 // pointer to previous block
128 size_t* prev = cast(size_t*) memory.ptr[-2];
129 if (!prev)
130 {
131 // The last segment is freed in RegexMatch
132 return false;
133 }
134 else
135 {
136 import core.stdc.stdlib : free;
137 // memory used in previous block
138 size_t size = memory.ptr[-1];
139 free(memory.ptr-2);
140 memory = prev[0 .. size];
141 lastState = size;
142 return true;
143 }
144 }
145
146 void initExternalMemory(void[] memBlock)
147 {
148 merge = arrayInChunk!(Trace)(re.hotspotTableSize, memBlock);
149 merge[] = Trace.init;
150 memory = cast(size_t[]) memBlock;
151 memory[0] = 0; // hidden pointer
152 memory[1] = 0; // used size
153 memory = memory[2..$];
154 }
155
156 void initialize(ref RegEx program, Stream stream, void[] memBlock)
157 {
158 re = program;
159 s = stream;
160 exhausted = false;
161 initExternalMemory(memBlock);
162 backrefed = null;
163 }
164
165 auto dupTo(void[] memory)
166 {
167 typeof(this) tmp = this;
168 tmp.initExternalMemory(memory);
169 return tmp;
170 }
171
172 this(ref RegEx program, Stream stream, void[] memBlock, dchar ch, DataIndex idx)
173 {
174 initialize(program, stream, memBlock);
175 front = ch;
176 index = idx;
177 }
178
179 this(ref RegEx program, Stream stream, void[] memBlock)
180 {
181 initialize(program, stream, memBlock);
182 next();
183 }
184
185 auto fwdMatcher(ref BacktrackingMatcher matcher, void[] memBlock)
186 {
187 alias BackMatcherTempl = .BacktrackingMatcher!(CTregex);
188 alias BackMatcher = BackMatcherTempl!(Char, Stream);
189 auto fwdMatcher = BackMatcher(matcher.re, s, memBlock, front, index);
190 return fwdMatcher;
191 }
192
193 auto bwdMatcher(ref BacktrackingMatcher matcher, void[] memBlock)
194 {
195 alias BackMatcherTempl = .BacktrackingMatcher!(CTregex);
196 alias BackMatcher = BackMatcherTempl!(Char, typeof(s.loopBack(index)));
197 auto fwdMatcher =
198 BackMatcher(matcher.re, s.loopBack(index), memBlock);
199 return fwdMatcher;
200 }
201
202 //
203 int matchFinalize()
204 {
205 immutable start = index;
206 immutable val = matchImpl();
207 if (val)
208 {//stream is updated here
209 matches[0].begin = start;
210 matches[0].end = index;
211 if (!(re.flags & RegexOption.global) || atEnd)
212 exhausted = true;
213 if (start == index)//empty match advances input
214 next();
215 return val;
216 }
217 else
218 return 0;
219 }
220
221 //lookup next match, fill matches with indices into input
222 int match(Group!DataIndex[] matches)
223 {
224 debug(std_regex_matcher)
225 {
226 writeln("------------------------------------------");
227 }
228 if (exhausted) //all matches collected
229 return false;
230 this.matches = matches;
231 if (re.flags & RegexInfo.oneShot)
232 {
233 exhausted = true;
234 const DataIndex start = index;
235 immutable m = matchImpl();
236 if (m)
237 {
238 matches[0].begin = start;
239 matches[0].end = index;
240 }
241 return m;
242 }
243 static if (kicked)
244 {
245 if (!re.kickstart.empty)
246 {
247 for (;;)
248 {
249 immutable val = matchFinalize();
250 if (val)
251 return val;
252 else
253 {
254 if (atEnd)
255 break;
256 search();
257 if (atEnd)
258 {
259 exhausted = true;
260 return matchFinalize();
261 }
262 }
263 }
264 exhausted = true;
265 return 0; //early return
266 }
267 }
268 //no search available - skip a char at a time
269 for (;;)
270 {
271 immutable val = matchFinalize();
272 if (val)
273 return val;
274 else
275 {
276 if (atEnd)
277 break;
278 next();
279 if (atEnd)
280 {
281 exhausted = true;
282 return matchFinalize();
283 }
284 }
285 }
286 exhausted = true;
287 return 0;
288 }
289
290 /+
291 match subexpression against input,
292 results are stored in matches
293 +/
294 int matchImpl()
295 {
296 static if (CTregex && is(typeof(nativeFn(this))))
297 {
298 debug(std_regex_ctr) writeln("using C-T matcher");
299 return nativeFn(this);
300 }
301 else
302 {
303 pc = 0;
304 counter = 0;
305 lastState = 0;
306 matches[] = Group!DataIndex.init;
307 auto start = s._index;
308 debug(std_regex_matcher)
309 writeln("Try match starting at ", s[index .. s.lastIndex]);
310 for (;;)
311 {
312 debug(std_regex_matcher)
313 writefln("PC: %s\tCNT: %s\t%s \tfront: %s src: %s",
314 pc, counter, disassemble(re.ir, pc, re.dict),
315 front, s._index);
316 switch (re.ir[pc].code)
317 {
318 case IR.OrChar://assumes IRL!(OrChar) == 1
319 if (atEnd)
320 goto L_backtrack;
321 uint len = re.ir[pc].sequence;
322 uint end = pc + len;
323 if (re.ir[pc].data != front && re.ir[pc+1].data != front)
324 {
325 for (pc = pc+2; pc < end; pc++)
326 if (re.ir[pc].data == front)
327 break;
328 if (pc == end)
329 goto L_backtrack;
330 }
331 pc = end;
332 next();
333 break;
334 case IR.Char:
335 if (atEnd || front != re.ir[pc].data)
336 goto L_backtrack;
337 pc += IRL!(IR.Char);
338 next();
339 break;
340 case IR.Any:
341 if (atEnd)
342 goto L_backtrack;
343 pc += IRL!(IR.Any);
344 next();
345 break;
346 case IR.CodepointSet:
347 if (atEnd || !re.charsets[re.ir[pc].data].scanFor(front))
348 goto L_backtrack;
349 next();
350 pc += IRL!(IR.CodepointSet);
351 break;
352 case IR.Trie:
353 if (atEnd || !re.matchers[re.ir[pc].data][front])
354 goto L_backtrack;
355 next();
356 pc += IRL!(IR.Trie);
357 break;
358 case IR.Wordboundary:
359 dchar back;
360 DataIndex bi;
361 //at start & end of input
362 if (atStart && wordMatcher[front])
363 {
364 pc += IRL!(IR.Wordboundary);
365 break;
366 }
367 else if (atEnd && s.loopBack(index).nextChar(back, bi)
368 && wordMatcher[back])
369 {
370 pc += IRL!(IR.Wordboundary);
371 break;
372 }
373 else if (s.loopBack(index).nextChar(back, bi))
374 {
375 immutable af = wordMatcher[front];
376 immutable ab = wordMatcher[back];
377 if (af ^ ab)
378 {
379 pc += IRL!(IR.Wordboundary);
380 break;
381 }
382 }
383 goto L_backtrack;
384 case IR.Notwordboundary:
385 dchar back;
386 DataIndex bi;
387 //at start & end of input
388 if (atStart && wordMatcher[front])
389 goto L_backtrack;
390 else if (atEnd && s.loopBack(index).nextChar(back, bi)
391 && wordMatcher[back])
392 goto L_backtrack;
393 else if (s.loopBack(index).nextChar(back, bi))
394 {
395 immutable af = wordMatcher[front];
396 immutable ab = wordMatcher[back];
397 if (af ^ ab)
398 goto L_backtrack;
399 }
400 pc += IRL!(IR.Wordboundary);
401 break;
402 case IR.Bof:
403 if (atStart)
404 pc += IRL!(IR.Bol);
405 else
406 goto L_backtrack;
407 break;
408 case IR.Bol:
409 dchar back;
410 DataIndex bi;
411 if (atStart)
412 pc += IRL!(IR.Bol);
413 else if (s.loopBack(index).nextChar(back,bi)
414 && endOfLine(back, front == '\n'))
415 {
416 pc += IRL!(IR.Bol);
417 }
418 else
419 goto L_backtrack;
420 break;
421 case IR.Eof:
422 if (atEnd)
423 pc += IRL!(IR.Eol);
424 else
425 goto L_backtrack;
426 break;
427 case IR.Eol:
428 dchar back;
429 DataIndex bi;
430 debug(std_regex_matcher) writefln("EOL (front 0x%x) %s", front, s[index .. s.lastIndex]);
431 //no matching inside \r\n
432 if (atEnd || (endOfLine(front, s.loopBack(index).nextChar(back,bi)
433 && back == '\r')))
434 {
435 pc += IRL!(IR.Eol);
436 }
437 else
438 goto L_backtrack;
439 break;
440 case IR.InfiniteStart, IR.InfiniteQStart:
441 pc += re.ir[pc].data + IRL!(IR.InfiniteStart);
442 //now pc is at end IR.Infinite(Q)End
443 uint len = re.ir[pc].data;
444 if (re.ir[pc].code == IR.InfiniteEnd)
445 {
446 pushState(pc+IRL!(IR.InfiniteEnd), counter);
447 pc -= len;
448 }
449 else
450 {
451 pushState(pc - len, counter);
452 pc += IRL!(IR.InfiniteEnd);
453 }
454 break;
455 case IR.InfiniteBloomStart:
456 pc += re.ir[pc].data + IRL!(IR.InfiniteBloomStart);
457 //now pc is at end IR.InfiniteBloomEnd
458 immutable len = re.ir[pc].data;
459 immutable filterIdx = re.ir[pc+2].raw;
460 if (re.filters[filterIdx][front])
461 pushState(pc+IRL!(IR.InfiniteBloomEnd), counter);
462 pc -= len;
463 break;
464 case IR.RepeatStart, IR.RepeatQStart:
465 pc += re.ir[pc].data + IRL!(IR.RepeatStart);
466 break;
467 case IR.RepeatEnd:
468 case IR.RepeatQEnd:
469 if (merge[re.ir[pc + 1].raw+counter].mark(index))
470 {
471 // merged!
472 goto L_backtrack;
473 }
474 //len, step, min, max
475 immutable len = re.ir[pc].data;
476 immutable step = re.ir[pc+2].raw;
477 immutable min = re.ir[pc+3].raw;
478 immutable max = re.ir[pc+4].raw;
479 if (counter < min)
480 {
481 counter += step;
482 pc -= len;
483 }
484 else if (counter < max)
485 {
486 if (re.ir[pc].code == IR.RepeatEnd)
487 {
488 pushState(pc + IRL!(IR.RepeatEnd), counter%step);
489 counter += step;
490 pc -= len;
491 }
492 else
493 {
494 pushState(pc - len, counter + step);
495 counter = counter%step;
496 pc += IRL!(IR.RepeatEnd);
497 }
498 }
499 else
500 {
501 counter = counter%step;
502 pc += IRL!(IR.RepeatEnd);
503 }
504 break;
505 case IR.InfiniteEnd:
506 case IR.InfiniteQEnd:
507 debug(std_regex_matcher) writeln("Infinited nesting:", infiniteNesting);
508 if (merge[re.ir[pc + 1].raw+counter].mark(index))
509 {
510 // merged!
511 goto L_backtrack;
512 }
513 immutable len = re.ir[pc].data;
514 if (re.ir[pc].code == IR.InfiniteEnd)
515 {
516 pushState(pc + IRL!(IR.InfiniteEnd), counter);
517 pc -= len;
518 }
519 else
520 {
521 pushState(pc-len, counter);
522 pc += IRL!(IR.InfiniteEnd);
523 }
524 break;
525 case IR.InfiniteBloomEnd:
526 debug(std_regex_matcher) writeln("Infinited nesting:", infiniteNesting);
527 if (merge[re.ir[pc + 1].raw+counter].mark(index))
528 {
529 // merged!
530 goto L_backtrack;
531 }
532 immutable len = re.ir[pc].data;
533 immutable filterIdx = re.ir[pc+2].raw;
534 if (re.filters[filterIdx][front])
535 {
536 infiniteNesting--;
537 pushState(pc + IRL!(IR.InfiniteBloomEnd), counter);
538 infiniteNesting++;
539 }
540 pc -= len;
541 break;
542 case IR.OrEnd:
543 if (merge[re.ir[pc + 1].raw+counter].mark(index))
544 {
545 // merged!
546 goto L_backtrack;
547 }
548 pc += IRL!(IR.OrEnd);
549 break;
550 case IR.OrStart:
551 pc += IRL!(IR.OrStart);
552 goto case;
553 case IR.Option:
554 immutable len = re.ir[pc].data;
555 if (re.ir[pc+len].code == IR.GotoEndOr)//not a last one
556 {
557 pushState(pc + len + IRL!(IR.Option), counter); //remember 2nd branch
558 }
559 pc += IRL!(IR.Option);
560 break;
561 case IR.GotoEndOr:
562 pc = pc + re.ir[pc].data + IRL!(IR.GotoEndOr);
563 break;
564 case IR.GroupStart:
565 immutable n = re.ir[pc].data;
566 matches[n].begin = index;
567 debug(std_regex_matcher) writefln("IR group #%u starts at %u", n, index);
568 pc += IRL!(IR.GroupStart);
569 break;
570 case IR.GroupEnd:
571 immutable n = re.ir[pc].data;
572 matches[n].end = index;
573 debug(std_regex_matcher) writefln("IR group #%u ends at %u", n, index);
574 pc += IRL!(IR.GroupEnd);
575 break;
576 case IR.LookaheadStart:
577 case IR.NeglookaheadStart:
578 immutable len = re.ir[pc].data;
579 auto save = index;
580 immutable ms = re.ir[pc+1].raw, me = re.ir[pc+2].raw;
581 auto mem = malloc(initialMemory(re))[0 .. initialMemory(re)];
582 scope(exit) free(mem.ptr);
583 static if (Stream.isLoopback)
584 {
585 auto matcher = bwdMatcher(this, mem);
586 }
587 else
588 {
589 auto matcher = fwdMatcher(this, mem);
590 }
591 matcher.matches = matches[ms .. me];
592 matcher.backrefed = backrefed.empty ? matches : backrefed;
593 matcher.re.ir = re.ir[
594 pc+IRL!(IR.LookaheadStart) .. pc+IRL!(IR.LookaheadStart)+len+IRL!(IR.LookaheadEnd)
595 ];
596 immutable match = (matcher.matchImpl() != 0) ^ (re.ir[pc].code == IR.NeglookaheadStart);
597 s.reset(save);
598 next();
599 if (!match)
600 goto L_backtrack;
601 else
602 {
603 pc += IRL!(IR.LookaheadStart)+len+IRL!(IR.LookaheadEnd);
604 }
605 break;
606 case IR.LookbehindStart:
607 case IR.NeglookbehindStart:
608 immutable len = re.ir[pc].data;
609 immutable ms = re.ir[pc+1].raw, me = re.ir[pc+2].raw;
610 auto mem = malloc(initialMemory(re))[0 .. initialMemory(re)];
611 scope(exit) free(mem.ptr);
612 static if (Stream.isLoopback)
613 {
614 alias Matcher = BacktrackingMatcher!(Char, Stream);
615 auto matcher = Matcher(re, s, mem, front, index);
616 }
617 else
618 {
619 alias Matcher = BacktrackingMatcher!(Char, typeof(s.loopBack(index)));
620 auto matcher = Matcher(re, s.loopBack(index), mem);
621 }
622 matcher.matches = matches[ms .. me];
623 matcher.re.ir = re.ir[
624 pc + IRL!(IR.LookbehindStart) .. pc + IRL!(IR.LookbehindStart) + len + IRL!(IR.LookbehindEnd)
625 ];
626 matcher.backrefed = backrefed.empty ? matches : backrefed;
627 immutable match = (matcher.matchImpl() != 0) ^ (re.ir[pc].code == IR.NeglookbehindStart);
628 if (!match)
629 goto L_backtrack;
630 else
631 {
632 pc += IRL!(IR.LookbehindStart)+len+IRL!(IR.LookbehindEnd);
633 }
634 break;
635 case IR.Backref:
636 immutable n = re.ir[pc].data;
637 auto referenced = re.ir[pc].localRef
638 ? s[matches[n].begin .. matches[n].end]
639 : s[backrefed[n].begin .. backrefed[n].end];
640 while (!atEnd && !referenced.empty && front == referenced.front)
641 {
642 next();
643 referenced.popFront();
644 }
645 if (referenced.empty)
646 pc++;
647 else
648 goto L_backtrack;
649 break;
650 case IR.Nop:
651 pc += IRL!(IR.Nop);
652 break;
653 case IR.LookaheadEnd:
654 case IR.NeglookaheadEnd:
655 case IR.LookbehindEnd:
656 case IR.NeglookbehindEnd:
657 case IR.End:
658 // cleanup stale stack blocks if any
659 while (prevStack()) {}
660 return re.ir[pc].data;
661 default:
662 debug printBytecode(re.ir[0..$]);
663 assert(0);
664 L_backtrack:
665 if (!popState())
666 {
667 s.reset(start);
668 return 0;
669 }
670 }
671 }
672 }
673 assert(0);
674 }
675
676 @property size_t stackAvail()
677 {
678 return memory.length - lastState;
679 }
680
681 void stackPush(T)(T val)
682 if (!isDynamicArray!T)
683 {
684 *cast(T*)&memory[lastState] = val;
685 enum delta = (T.sizeof+size_t.sizeof/2)/size_t.sizeof;
686 lastState += delta;
687 debug(std_regex_matcher) writeln("push element SP= ", lastState);
688 }
689
690 void stackPush(T)(T[] val)
691 {
692 static assert(T.sizeof % size_t.sizeof == 0);
693 (cast(T*)&memory[lastState])[0 .. val.length]
694 = val[0..$];
695 lastState += val.length*(T.sizeof/size_t.sizeof);
696 debug(std_regex_matcher) writeln("push array SP= ", lastState);
697 }
698
699 void stackPop(T)(ref T val)
700 if (!isDynamicArray!T)
701 {
702 enum delta = (T.sizeof+size_t.sizeof/2)/size_t.sizeof;
703 lastState -= delta;
704 val = *cast(T*)&memory[lastState];
705 debug(std_regex_matcher) writeln("pop element SP= ", lastState);
706 }
707
708 void stackPop(T)(T[] val)
709 {
710 stackPop(val); // call ref version
711 }
712 void stackPop(T)(ref T[] val)
713 {
714 lastState -= val.length*(T.sizeof/size_t.sizeof);
715 val[0..$] = (cast(T*)&memory[lastState])[0 .. val.length];
716 debug(std_regex_matcher) writeln("pop array SP= ", lastState);
717 }
718
719 static if (!CTregex)
720 {
721 //helper function, saves engine state
722 void pushState(uint pc, uint counter)
723 {
724 if (stateSize + 2 * matches.length > stackAvail)
725 {
726 newStack();
727 }
728 *cast(State*)&memory[lastState] =
729 State(index, pc, counter, infiniteNesting);
730 lastState += stateSize;
731 memory[lastState .. lastState + 2 * matches.length] = (cast(size_t[]) matches)[];
732 lastState += 2*matches.length;
733 debug(std_regex_matcher)
734 writefln("Saved(pc=%s) front: %s src: %s",
735 pc, front, s[index .. s.lastIndex]);
736 }
737
738 //helper function, restores engine state
739 bool popState()
740 {
741 if (!lastState && !prevStack())
742 return false;
743 lastState -= 2*matches.length;
744 auto pm = cast(size_t[]) matches;
745 pm[] = memory[lastState .. lastState + 2 * matches.length];
746 lastState -= stateSize;
747 State* state = cast(State*)&memory[lastState];
748 index = state.index;
749 pc = state.pc;
750 counter = state.counter;
751 infiniteNesting = state.infiniteNesting;
752 debug(std_regex_matcher)
753 {
754 writefln("Restored matches", front, s[index .. s.lastIndex]);
755 foreach (i, m; matches)
756 writefln("Sub(%d) : %s..%s", i, m.begin, m.end);
757 }
758 s.reset(index);
759 next();
760 debug(std_regex_matcher)
761 writefln("Backtracked (pc=%s) front: %s src: %s",
762 pc, front, s[index .. s.lastIndex]);
763 return true;
764 }
765 }
766 }
767 }
768
769 //very shitty string formatter, $$ replaced with next argument converted to string
770 @trusted string ctSub( U...)(string format, U args)
771 {
772 import std.conv : to;
773 bool seenDollar;
774 foreach (i, ch; format)
775 {
776 if (ch == '$')
777 {
778 if (seenDollar)
779 {
780 static if (args.length > 0)
781 {
782 return format[0 .. i - 1] ~ to!string(args[0])
783 ~ ctSub(format[i + 1 .. $], args[1 .. $]);
784 }
785 else
786 assert(0);
787 }
788 else
789 seenDollar = true;
790 }
791 else
792 seenDollar = false;
793
794 }
795 return format;
796 }
797
798 alias Sequence(int B, int E) = staticIota!(B, E);
799
800 struct CtContext
801 {
802 import std.conv : to, text;
803 //dirty flags
804 bool counter;
805 //to mark the portion of matches to save
806 int match, total_matches;
807 int reserved;
808 CodepointSet[] charsets;
809
810
811 //state of codegenerator
812 static struct CtState
813 {
814 string code;
815 int addr;
816 }
817
818 this(Char)(Regex!Char re)
819 {
820 match = 1;
821 reserved = 1; //first match is skipped
822 total_matches = re.ngroup;
823 charsets = re.charsets;
824 }
825
826 CtContext lookaround(uint s, uint e)
827 {
828 CtContext ct;
829 ct.total_matches = e - s;
830 ct.match = 1;
831 return ct;
832 }
833
834 //restore state having current context
835 string restoreCode()
836 {
837 string text;
838 //stack is checked in L_backtrack
839 text ~= counter
840 ? "
841 stackPop(counter);"
842 : "
843 counter = 0;";
844 if (match < total_matches)
845 {
846 text ~= ctSub("
847 stackPop(matches[$$..$$]);", reserved, match);
848 text ~= ctSub("
849 matches[$$..$] = typeof(matches[0]).init;", match);
850 }
851 else
852 text ~= ctSub("
853 stackPop(matches[$$..$]);", reserved);
854 return text;
855 }
856
857 //save state having current context
858 string saveCode(uint pc, string count_expr="counter")
859 {
860 string text = ctSub("
861 if (stackAvail < $$*(Group!(DataIndex)).sizeof/size_t.sizeof + $$)
862 {
863 newStack();
864 }", match - reserved, cast(int) counter + 2);
865 if (match < total_matches)
866 text ~= ctSub("
867 stackPush(matches[$$..$$]);", reserved, match);
868 else
869 text ~= ctSub("
870 stackPush(matches[$$..$]);", reserved);
871 text ~= counter ? ctSub("
872 stackPush($$);", count_expr) : "";
873 text ~= ctSub("
874 stackPush(index); stackPush($$); \n", pc);
875 return text;
876 }
877
878 //
879 CtState ctGenBlock(Bytecode[] ir, int addr)
880 {
881 CtState result;
882 result.addr = addr;
883 while (!ir.empty)
884 {
885 auto n = ctGenGroup(ir, result.addr);
886 result.code ~= n.code;
887 result.addr = n.addr;
888 }
889 return result;
890 }
891
892 //
893 CtState ctGenGroup(ref Bytecode[] ir, int addr)
894 {
895 import std.algorithm.comparison : max;
896 auto bailOut = "goto L_backtrack;";
897 auto nextInstr = ctSub("goto case $$;", addr+1);
898 CtState r;
899 assert(!ir.empty);
900 switch (ir[0].code)
901 {
902 case IR.InfiniteStart, IR.InfiniteBloomStart,IR.InfiniteQStart, IR.RepeatStart, IR.RepeatQStart:
903 immutable infLoop =
904 ir[0].code == IR.InfiniteStart || ir[0].code == IR.InfiniteQStart ||
905 ir[0].code == IR.InfiniteBloomStart;
906
907 counter = counter ||
908 ir[0].code == IR.RepeatStart || ir[0].code == IR.RepeatQStart;
909 immutable len = ir[0].data;
910 auto nir = ir[ir[0].length .. ir[0].length+len];
911 r = ctGenBlock(nir, addr+1);
912 //start/end codegen
913 //r.addr is at last test+ jump of loop, addr+1 is body of loop
914 nir = ir[ir[0].length + len .. $];
915 r.code = ctGenFixupCode(ir[0 .. ir[0].length], addr, r.addr) ~ r.code;
916 r.code ~= ctGenFixupCode(nir, r.addr, addr+1);
917 r.addr += 2; //account end instruction + restore state
918 ir = nir;
919 break;
920 case IR.OrStart:
921 immutable len = ir[0].data;
922 auto nir = ir[ir[0].length .. ir[0].length+len];
923 r = ctGenAlternation(nir, addr);
924 ir = ir[ir[0].length + len .. $];
925 assert(ir[0].code == IR.OrEnd);
926 ir = ir[ir[0].length..$];
927 break;
928 case IR.LookaheadStart:
929 case IR.NeglookaheadStart:
930 case IR.LookbehindStart:
931 case IR.NeglookbehindStart:
932 immutable len = ir[0].data;
933 immutable behind = ir[0].code == IR.LookbehindStart || ir[0].code == IR.NeglookbehindStart;
934 immutable negative = ir[0].code == IR.NeglookaheadStart || ir[0].code == IR.NeglookbehindStart;
935 string fwdType = "typeof(fwdMatcher(matcher, []))";
936 string bwdType = "typeof(bwdMatcher(matcher, []))";
937 string fwdCreate = "fwdMatcher(matcher, mem)";
938 string bwdCreate = "bwdMatcher(matcher, mem)";
939 immutable start = IRL!(IR.LookbehindStart);
940 immutable end = IRL!(IR.LookbehindStart)+len+IRL!(IR.LookaheadEnd);
941 CtContext context = lookaround(ir[1].raw, ir[2].raw); //split off new context
942 auto slice = ir[start .. end];
943 r.code ~= ctSub(`
944 case $$: //fake lookaround "atom"
945 static if (typeof(matcher.s).isLoopback)
946 alias Lookaround = $$;
947 else
948 alias Lookaround = $$;
949 static bool matcher_$$(ref Lookaround matcher) @trusted
950 {
951 //(neg)lookaround piece start
952 $$
953 //(neg)lookaround piece ends
954 }
955 auto save = index;
956 auto mem = malloc(initialMemory(re))[0 .. initialMemory(re)];
957 scope(exit) free(mem.ptr);
958 static if (typeof(matcher.s).isLoopback)
959 auto lookaround = $$;
960 else
961 auto lookaround = $$;
962 lookaround.matches = matches[$$..$$];
963 lookaround.backrefed = backrefed.empty ? matches : backrefed;
964 lookaround.nativeFn = &matcher_$$; //hookup closure's binary code
965 int match = $$;
966 s.reset(save);
967 next();
968 if (match)
969 $$
970 else
971 $$`, addr,
972 behind ? fwdType : bwdType, behind ? bwdType : fwdType,
973 addr, context.ctGenRegEx(slice),
974 behind ? fwdCreate : bwdCreate, behind ? bwdCreate : fwdCreate,
975 ir[1].raw, ir[2].raw, //start - end of matches slice
976 addr,
977 negative ? "!lookaround.matchImpl()" : "lookaround.matchImpl()",
978 nextInstr, bailOut);
979 ir = ir[end .. $];
980 r.addr = addr + 1;
981 break;
982 case IR.LookaheadEnd: case IR.NeglookaheadEnd:
983 case IR.LookbehindEnd: case IR.NeglookbehindEnd:
984 ir = ir[IRL!(IR.LookaheadEnd) .. $];
985 r.addr = addr;
986 break;
987 default:
988 assert(ir[0].isAtom, text(ir[0].mnemonic));
989 r = ctGenAtom(ir, addr);
990 }
991 return r;
992 }
993
994 //generate source for bytecode contained in OrStart ... OrEnd
995 CtState ctGenAlternation(Bytecode[] ir, int addr)
996 {
997 CtState[] pieces;
998 CtState r;
999 enum optL = IRL!(IR.Option);
1000 for (;;)
1001 {
1002 assert(ir[0].code == IR.Option);
1003 auto len = ir[0].data;
1004 if (optL+len < ir.length && ir[optL+len].code == IR.Option)//not a last option
1005 {
1006 auto nir = ir[optL .. optL+len-IRL!(IR.GotoEndOr)];
1007 r = ctGenBlock(nir, addr+2);//space for Option + restore state
1008 //r.addr+1 to account GotoEndOr at end of branch
1009 r.code = ctGenFixupCode(ir[0 .. ir[0].length], addr, r.addr+1) ~ r.code;
1010 addr = r.addr+1;//leave space for GotoEndOr
1011 pieces ~= r;
1012 ir = ir[optL + len .. $];
1013 }
1014 else
1015 {
1016 pieces ~= ctGenBlock(ir[optL..$], addr);
1017 addr = pieces[$-1].addr;
1018 break;
1019 }
1020 }
1021 r = pieces[0];
1022 for (uint i = 1; i < pieces.length; i++)
1023 {
1024 r.code ~= ctSub(`
1025 case $$:
1026 goto case $$; `, pieces[i-1].addr, addr);
1027 r.code ~= pieces[i].code;
1028 }
1029 r.addr = addr;
1030 return r;
1031 }
1032
1033 // generate fixup code for instruction in ir,
1034 // fixup means it has an alternative way for control flow
1035 string ctGenFixupCode(Bytecode[] ir, int addr, int fixup)
1036 {
1037 return ctGenFixupCode(ir, addr, fixup); // call ref Bytecode[] version
1038 }
1039 string ctGenFixupCode(ref Bytecode[] ir, int addr, int fixup)
1040 {
1041 string r;
1042 string testCode;
1043 r = ctSub(`
1044 case $$: debug(std_regex_matcher) writeln("#$$");`,
1045 addr, addr);
1046 switch (ir[0].code)
1047 {
1048 case IR.InfiniteStart, IR.InfiniteQStart, IR.InfiniteBloomStart:
1049 r ~= ctSub( `
1050 goto case $$;`, fixup);
1051 ir = ir[ir[0].length..$];
1052 break;
1053 case IR.InfiniteEnd:
1054 testCode = ctQuickTest(ir[IRL!(IR.InfiniteEnd) .. $],addr + 1);
1055 r ~= ctSub( `
1056 if (merge[$$+counter].mark(index))
1057 {
1058 // merged!
1059 goto L_backtrack;
1060 }
1061
1062 $$
1063 {
1064 $$
1065 }
1066 goto case $$;
1067 case $$: //restore state and go out of loop
1068 $$
1069 goto case;`, ir[1].raw, testCode, saveCode(addr+1), fixup,
1070 addr+1, restoreCode());
1071 ir = ir[ir[0].length..$];
1072 break;
1073 case IR.InfiniteBloomEnd:
1074 //TODO: check bloom filter and skip on failure
1075 testCode = ctQuickTest(ir[IRL!(IR.InfiniteBloomEnd) .. $],addr + 1);
1076 r ~= ctSub( `
1077 if (merge[$$+counter].mark(index))
1078 {
1079 // merged!
1080 goto L_backtrack;
1081 }
1082
1083 $$
1084 {
1085 $$
1086 }
1087 goto case $$;
1088 case $$: //restore state and go out of loop
1089 $$
1090 goto case;`, ir[1].raw, testCode, saveCode(addr+1), fixup,
1091 addr+1, restoreCode());
1092 ir = ir[ir[0].length..$];
1093 break;
1094 case IR.InfiniteQEnd:
1095 testCode = ctQuickTest(ir[IRL!(IR.InfiniteEnd) .. $],addr + 1);
1096 auto altCode = testCode.length ? ctSub("else goto case $$;", fixup) : "";
1097 r ~= ctSub( `
1098 if (merge[$$+counter].mark(index))
1099 {
1100 // merged!
1101 goto L_backtrack;
1102 }
1103
1104 $$
1105 {
1106 $$
1107 goto case $$;
1108 }
1109 $$
1110 case $$://restore state and go inside loop
1111 $$
1112 goto case $$;`, ir[1].raw,
1113 testCode, saveCode(addr+1), addr+2, altCode,
1114 addr+1, restoreCode(), fixup);
1115 ir = ir[ir[0].length..$];
1116 break;
1117 case IR.RepeatStart, IR.RepeatQStart:
1118 r ~= ctSub( `
1119 goto case $$;`, fixup);
1120 ir = ir[ir[0].length..$];
1121 break;
1122 case IR.RepeatEnd, IR.RepeatQEnd:
1123 //len, step, min, max
1124 immutable len = ir[0].data;
1125 immutable step = ir[2].raw;
1126 immutable min = ir[3].raw;
1127 immutable max = ir[4].raw;
1128 r ~= ctSub(`
1129 if (merge[$$+counter].mark(index))
1130 {
1131 // merged!
1132 goto L_backtrack;
1133 }
1134 if (counter < $$)
1135 {
1136 debug(std_regex_matcher) writeln("RepeatEnd min case pc=", $$);
1137 counter += $$;
1138 goto case $$;
1139 }`, ir[1].raw, min, addr, step, fixup);
1140 if (ir[0].code == IR.RepeatEnd)
1141 {
1142 string counter_expr = ctSub("counter % $$", step);
1143 r ~= ctSub(`
1144 else if (counter < $$)
1145 {
1146 $$
1147 counter += $$;
1148 goto case $$;
1149 }`, max, saveCode(addr+1, counter_expr), step, fixup);
1150 }
1151 else
1152 {
1153 string counter_expr = ctSub("counter % $$", step);
1154 r ~= ctSub(`
1155 else if (counter < $$)
1156 {
1157 $$
1158 counter = counter % $$;
1159 goto case $$;
1160 }`, max, saveCode(addr+1,counter_expr), step, addr+2);
1161 }
1162 r ~= ctSub(`
1163 else
1164 {
1165 counter = counter % $$;
1166 goto case $$;
1167 }
1168 case $$: //restore state
1169 $$
1170 goto case $$;`, step, addr+2, addr+1, restoreCode(),
1171 ir[0].code == IR.RepeatEnd ? addr+2 : fixup );
1172 ir = ir[ir[0].length..$];
1173 break;
1174 case IR.Option:
1175 r ~= ctSub( `
1176 {
1177 $$
1178 }
1179 goto case $$;
1180 case $$://restore thunk to go to the next group
1181 $$
1182 goto case $$;`, saveCode(addr+1), addr+2,
1183 addr+1, restoreCode(), fixup);
1184 ir = ir[ir[0].length..$];
1185 break;
1186 default:
1187 assert(0, text(ir[0].mnemonic));
1188 }
1189 return r;
1190 }
1191
1192
1193 string ctQuickTest(Bytecode[] ir, int id)
1194 {
1195 uint pc = 0;
1196 while (pc < ir.length && ir[pc].isAtom)
1197 {
1198 if (ir[pc].code == IR.GroupStart || ir[pc].code == IR.GroupEnd)
1199 {
1200 pc++;
1201 }
1202 else if (ir[pc].code == IR.Backref)
1203 break;
1204 else
1205 {
1206 auto code = ctAtomCode(ir[pc..$], -1);
1207 return ctSub(`
1208 int test_$$()
1209 {
1210 $$ //$$
1211 }
1212 if (test_$$() >= 0)`, id, code.ptr ? code : "return 0;",
1213 ir[pc].mnemonic, id);
1214 }
1215 }
1216 return "";
1217 }
1218
1219 //process & generate source for simple bytecodes at front of ir using address addr
1220 CtState ctGenAtom(ref Bytecode[] ir, int addr)
1221 {
1222 CtState result;
1223 result.code = ctAtomCode(ir, addr);
1224 ir.popFrontN(ir[0].code == IR.OrChar ? ir[0].sequence : ir[0].length);
1225 result.addr = addr + 1;
1226 return result;
1227 }
1228
1229 //D code for atom at ir using address addr, addr < 0 means quickTest
1230 string ctAtomCode(Bytecode[] ir, int addr)
1231 {
1232 string code;
1233 string bailOut, nextInstr;
1234 if (addr < 0)
1235 {
1236 bailOut = "return -1;";
1237 nextInstr = "return 0;";
1238 }
1239 else
1240 {
1241 bailOut = "goto L_backtrack;";
1242 nextInstr = ctSub("goto case $$;", addr+1);
1243 code ~= ctSub( `
1244 case $$: debug(std_regex_matcher) writeln("#$$");
1245 `, addr, addr);
1246 }
1247 switch (ir[0].code)
1248 {
1249 case IR.OrChar://assumes IRL!(OrChar) == 1
1250 code ~= ctSub(`
1251 if (atEnd)
1252 $$`, bailOut);
1253 immutable len = ir[0].sequence;
1254 for (uint i = 0; i < len; i++)
1255 {
1256 code ~= ctSub( `
1257 if (front == $$)
1258 {
1259 $$
1260 $$
1261 }`, ir[i].data, addr >= 0 ? "next();" :"", nextInstr);
1262 }
1263 code ~= ctSub( `
1264 $$`, bailOut);
1265 break;
1266 case IR.Char:
1267 code ~= ctSub( `
1268 if (atEnd || front != $$)
1269 $$
1270 $$
1271 $$`, ir[0].data, bailOut, addr >= 0 ? "next();" :"", nextInstr);
1272 break;
1273 case IR.Any:
1274 code ~= ctSub( `
1275 if (atEnd || (!(re.flags & RegexOption.singleline)
1276 && (front == '\r' || front == '\n')))
1277 $$
1278 $$
1279 $$`, bailOut, addr >= 0 ? "next();" :"",nextInstr);
1280 break;
1281 case IR.CodepointSet:
1282 if (charsets.length)
1283 {
1284 string name = `func_`~to!string(addr+1);
1285 string funcCode = charsets[ir[0].data].toSourceCode(name);
1286 code ~= ctSub( `
1287 static $$
1288 if (atEnd || !$$(front))
1289 $$
1290 $$
1291 $$`, funcCode, name, bailOut, addr >= 0 ? "next();" :"", nextInstr);
1292 }
1293 else
1294 code ~= ctSub( `
1295 if (atEnd || !re.charsets[$$].scanFor(front))
1296 $$
1297 $$
1298 $$`, ir[0].data, bailOut, addr >= 0 ? "next();" :"", nextInstr);
1299 break;
1300 case IR.Trie:
1301 if (charsets.length && charsets[ir[0].data].byInterval.length <= 8)
1302 goto case IR.CodepointSet;
1303 code ~= ctSub( `
1304 if (atEnd || !re.matchers[$$][front])
1305 $$
1306 $$
1307 $$`, ir[0].data, bailOut, addr >= 0 ? "next();" :"", nextInstr);
1308 break;
1309 case IR.Wordboundary:
1310 code ~= ctSub( `
1311 dchar back;
1312 DataIndex bi;
1313 if (atStart && wordMatcher[front])
1314 {
1315 $$
1316 }
1317 else if (atEnd && s.loopBack(index).nextChar(back, bi)
1318 && wordMatcher[back])
1319 {
1320 $$
1321 }
1322 else if (s.loopBack(index).nextChar(back, bi))
1323 {
1324 bool af = wordMatcher[front];
1325 bool ab = wordMatcher[back];
1326 if (af ^ ab)
1327 {
1328 $$
1329 }
1330 }
1331 $$`, nextInstr, nextInstr, nextInstr, bailOut);
1332 break;
1333 case IR.Notwordboundary:
1334 code ~= ctSub( `
1335 dchar back;
1336 DataIndex bi;
1337 //at start & end of input
1338 if (atStart && wordMatcher[front])
1339 $$
1340 else if (atEnd && s.loopBack(index).nextChar(back, bi)
1341 && wordMatcher[back])
1342 $$
1343 else if (s.loopBack(index).nextChar(back, bi))
1344 {
1345 bool af = wordMatcher[front];
1346 bool ab = wordMatcher[back];
1347 if (af ^ ab)
1348 $$
1349 }
1350 $$`, bailOut, bailOut, bailOut, nextInstr);
1351
1352 break;
1353 case IR.Bol:
1354 code ~= ctSub(`
1355 dchar back;
1356 DataIndex bi;
1357 if (atStart || (s.loopBack(index).nextChar(back,bi)
1358 && endOfLine(back, front == '\n')))
1359 {
1360 debug(std_regex_matcher) writeln("BOL matched");
1361 $$
1362 }
1363 else
1364 $$`, nextInstr, bailOut);
1365
1366 break;
1367 case IR.Bof:
1368 code ~= ctSub(`
1369 if (atStart)
1370 {
1371 debug(std_regex_matcher) writeln("BOF matched");
1372 $$
1373 }
1374 else
1375 $$`, nextInstr, bailOut);
1376 break;
1377 case IR.Eol:
1378 code ~= ctSub(`
1379 dchar back;
1380 DataIndex bi;
1381 debug(std_regex_matcher) writefln("EOL (front 0x%x) %s", front, s[index .. s.lastIndex]);
1382 //no matching inside \r\n
1383 if (atEnd || (endOfLine(front, s.loopBack(index).nextChar(back,bi)
1384 && back == '\r')))
1385 {
1386 debug(std_regex_matcher) writeln("EOL matched");
1387 $$
1388 }
1389 else
1390 $$`, nextInstr, bailOut);
1391 break;
1392 case IR.Eof:
1393 code ~= ctSub(`
1394 if (atEnd)
1395 {
1396 debug(std_regex_matcher) writeln("BOF matched");
1397 $$
1398 }
1399 else
1400 $$`, nextInstr, bailOut);
1401 break;
1402 case IR.GroupStart:
1403 code ~= ctSub(`
1404 matches[$$].begin = index;
1405 $$`, ir[0].data, nextInstr);
1406 match = ir[0].data+1;
1407 break;
1408 case IR.GroupEnd:
1409 code ~= ctSub(`
1410 matches[$$].end = index;
1411 $$`, ir[0].data, nextInstr);
1412 break;
1413 case IR.Backref:
1414 string mStr = "auto referenced = ";
1415 mStr ~= ir[0].localRef
1416 ? ctSub("s[matches[$$].begin .. matches[$$].end];",
1417 ir[0].data, ir[0].data)
1418 : ctSub("s[backrefed[$$].begin .. backrefed[$$].end];",
1419 ir[0].data, ir[0].data);
1420 code ~= ctSub( `
1421 $$
1422 while (!atEnd && !referenced.empty && front == referenced.front)
1423 {
1424 next();
1425 referenced.popFront();
1426 }
1427 if (referenced.empty)
1428 $$
1429 else
1430 $$`, mStr, nextInstr, bailOut);
1431 break;
1432 case IR.Nop:
1433 case IR.End:
1434 break;
1435 default:
1436 assert(0, text(ir[0].mnemonic, " is not supported yet"));
1437 }
1438 return code;
1439 }
1440
1441 //generate D code for the whole regex
1442 public string ctGenRegEx(Bytecode[] ir)
1443 {
1444 auto bdy = ctGenBlock(ir, 0);
1445 auto r = `
1446 import core.stdc.stdlib;
1447 with(matcher)
1448 {
1449 pc = 0;
1450 counter = 0;
1451 lastState = 0;
1452 matches[] = Group!DataIndex.init;
1453 auto start = s._index;`;
1454 r ~= `
1455 goto StartLoop;
1456 debug(std_regex_matcher) writeln("Try CT matching starting at ",s[index .. s.lastIndex]);
1457 L_backtrack:
1458 if (lastState || prevStack())
1459 {
1460 stackPop(pc);
1461 stackPop(index);
1462 s.reset(index);
1463 next();
1464 }
1465 else
1466 {
1467 s.reset(start);
1468 return false;
1469 }
1470 StartLoop:
1471 switch (pc)
1472 {
1473 `;
1474 r ~= bdy.code;
1475 r ~= ctSub(`
1476 case $$: break;`,bdy.addr);
1477 r ~= `
1478 default:
1479 assert(0);
1480 }
1481 // cleanup stale stack blocks
1482 while (prevStack()) {}
1483 return true;
1484 }
1485 `;
1486 return r;
1487 }
1488
1489 }
1490
1491 string ctGenRegExCode(Char)(Regex!Char re)
1492 {
1493 auto context = CtContext(re);
1494 return context.ctGenRegEx(re.ir);
1495 }
1496