1181254a7Smrg //Written in the D programming language
2181254a7Smrg /*
3181254a7Smrg Regular expression pattern parser.
4181254a7Smrg */
5181254a7Smrg module std.regex.internal.parser;
6181254a7Smrg
7*b1e83836Smrg import std.regex.internal.ir;
8181254a7Smrg import std.range.primitives, std.uni, std.meta,
9181254a7Smrg std.traits, std.typecons, std.exception;
10*b1e83836Smrg static import std.ascii;
11181254a7Smrg
12181254a7Smrg // package relevant info from parser into a regex object
makeRegex(S,CG)13181254a7Smrg auto makeRegex(S, CG)(Parser!(S, CG) p)
14181254a7Smrg {
15*b1e83836Smrg import std.regex.internal.backtracking : BacktrackingMatcher;
16*b1e83836Smrg import std.regex.internal.thompson : ThompsonMatcher;
17*b1e83836Smrg import std.algorithm.searching : canFind;
18*b1e83836Smrg alias Char = BasicElementOf!S;
19*b1e83836Smrg Regex!Char re;
20181254a7Smrg auto g = p.g;
21181254a7Smrg with(re)
22181254a7Smrg {
23181254a7Smrg ir = g.ir;
24181254a7Smrg dict = g.dict;
25181254a7Smrg ngroup = g.ngroup;
26181254a7Smrg maxCounterDepth = g.counterDepth;
27181254a7Smrg flags = p.re_flags;
28181254a7Smrg charsets = g.charsets;
29181254a7Smrg matchers = g.matchers;
30181254a7Smrg backrefed = g.backrefed;
31*b1e83836Smrg re.pattern = p.origin.idup;
32181254a7Smrg re.postprocess();
33*b1e83836Smrg // check if we have backreferences, if so - use backtracking
34*b1e83836Smrg if (__ctfe) factory = null; // allows us to use the awful enum re = regex(...);
35*b1e83836Smrg else if (re.backrefed.canFind!"a != 0")
36*b1e83836Smrg factory = new RuntimeFactory!(BacktrackingMatcher, Char);
37*b1e83836Smrg else
38*b1e83836Smrg factory = new RuntimeFactory!(ThompsonMatcher, Char);
39181254a7Smrg debug(std_regex_parser)
40181254a7Smrg {
41181254a7Smrg __ctfe || print();
42181254a7Smrg }
43181254a7Smrg //@@@BUG@@@ (not reduced)
44181254a7Smrg //somehow just using validate _collides_ with std.utf.validate (!)
45181254a7Smrg version (assert) re.validateRe();
46181254a7Smrg }
47181254a7Smrg return re;
48181254a7Smrg }
49181254a7Smrg
50181254a7Smrg // helper for unittest
51181254a7Smrg auto makeRegex(S)(S arg)
52181254a7Smrg if (isSomeString!S)
53181254a7Smrg {
54181254a7Smrg return makeRegex(Parser!(S, CodeGen)(arg, ""));
55181254a7Smrg }
56181254a7Smrg
57181254a7Smrg @system unittest
58181254a7Smrg {
59181254a7Smrg import std.algorithm.comparison : equal;
60181254a7Smrg auto re = makeRegex(`(?P<name>\w+) = (?P<var>\d+)`);
61181254a7Smrg auto nc = re.namedCaptures;
62181254a7Smrg static assert(isRandomAccessRange!(typeof(nc)));
63181254a7Smrg assert(!nc.empty);
64181254a7Smrg assert(nc.length == 2);
65181254a7Smrg assert(nc.equal(["name", "var"]));
66181254a7Smrg assert(nc[0] == "name");
67181254a7Smrg assert(nc[1..$].equal(["var"]));
68181254a7Smrg
69181254a7Smrg re = makeRegex(`(\w+) (?P<named>\w+) (\w+)`);
70181254a7Smrg nc = re.namedCaptures;
71181254a7Smrg assert(nc.length == 1);
72181254a7Smrg assert(nc[0] == "named");
73181254a7Smrg assert(nc.front == "named");
74181254a7Smrg assert(nc.back == "named");
75181254a7Smrg
76181254a7Smrg re = makeRegex(`(\w+) (\w+)`);
77181254a7Smrg nc = re.namedCaptures;
78181254a7Smrg assert(nc.empty);
79181254a7Smrg
80181254a7Smrg re = makeRegex(`(?P<year>\d{4})/(?P<month>\d{2})/(?P<day>\d{2})/`);
81181254a7Smrg nc = re.namedCaptures;
82181254a7Smrg auto cp = nc.save;
83181254a7Smrg assert(nc.equal(cp));
84181254a7Smrg nc.popFront();
85181254a7Smrg assert(nc.equal(cp[1..$]));
86181254a7Smrg nc.popBack();
87181254a7Smrg assert(nc.equal(cp[1 .. $ - 1]));
88181254a7Smrg }
89181254a7Smrg
90181254a7Smrg
reverseBytecode()91181254a7Smrg @trusted void reverseBytecode()(Bytecode[] code)
92181254a7Smrg {
93181254a7Smrg Bytecode[] rev = new Bytecode[code.length];
94181254a7Smrg uint revPc = cast(uint) rev.length;
95181254a7Smrg Stack!(Tuple!(uint, uint, uint)) stack;
96181254a7Smrg uint start = 0;
97181254a7Smrg uint end = cast(uint) code.length;
98181254a7Smrg for (;;)
99181254a7Smrg {
100181254a7Smrg for (uint pc = start; pc < end; )
101181254a7Smrg {
102181254a7Smrg immutable len = code[pc].length;
103181254a7Smrg if (code[pc].code == IR.GotoEndOr)
104181254a7Smrg break; //pick next alternation branch
105181254a7Smrg if (code[pc].isAtom)
106181254a7Smrg {
107181254a7Smrg rev[revPc - len .. revPc] = code[pc .. pc + len];
108181254a7Smrg revPc -= len;
109181254a7Smrg pc += len;
110181254a7Smrg }
111181254a7Smrg else if (code[pc].isStart || code[pc].isEnd)
112181254a7Smrg {
113181254a7Smrg //skip over other embedded lookbehinds they are reversed
114181254a7Smrg if (code[pc].code == IR.LookbehindStart
115181254a7Smrg || code[pc].code == IR.NeglookbehindStart)
116181254a7Smrg {
117181254a7Smrg immutable blockLen = len + code[pc].data
118181254a7Smrg + code[pc].pairedLength;
119181254a7Smrg rev[revPc - blockLen .. revPc] = code[pc .. pc + blockLen];
120181254a7Smrg pc += blockLen;
121181254a7Smrg revPc -= blockLen;
122181254a7Smrg continue;
123181254a7Smrg }
124181254a7Smrg immutable second = code[pc].indexOfPair(pc);
125181254a7Smrg immutable secLen = code[second].length;
126181254a7Smrg rev[revPc - secLen .. revPc] = code[second .. second + secLen];
127181254a7Smrg revPc -= secLen;
128181254a7Smrg if (code[pc].code == IR.OrStart)
129181254a7Smrg {
130181254a7Smrg //we pass len bytes forward, but secLen in reverse
131181254a7Smrg immutable revStart = revPc - (second + len - secLen - pc);
132181254a7Smrg uint r = revStart;
133181254a7Smrg uint i = pc + IRL!(IR.OrStart);
134181254a7Smrg while (code[i].code == IR.Option)
135181254a7Smrg {
136181254a7Smrg if (code[i - 1].code != IR.OrStart)
137181254a7Smrg {
138181254a7Smrg assert(code[i - 1].code == IR.GotoEndOr);
139181254a7Smrg rev[r - 1] = code[i - 1];
140181254a7Smrg }
141181254a7Smrg rev[r] = code[i];
142181254a7Smrg auto newStart = i + IRL!(IR.Option);
143181254a7Smrg auto newEnd = newStart + code[i].data;
144181254a7Smrg auto newRpc = r + code[i].data + IRL!(IR.Option);
145181254a7Smrg if (code[newEnd].code != IR.OrEnd)
146181254a7Smrg {
147181254a7Smrg newRpc--;
148181254a7Smrg }
149181254a7Smrg stack.push(tuple(newStart, newEnd, newRpc));
150181254a7Smrg r += code[i].data + IRL!(IR.Option);
151181254a7Smrg i += code[i].data + IRL!(IR.Option);
152181254a7Smrg }
153181254a7Smrg pc = i;
154181254a7Smrg revPc = revStart;
155181254a7Smrg assert(code[pc].code == IR.OrEnd);
156181254a7Smrg }
157181254a7Smrg else
158181254a7Smrg pc += len;
159181254a7Smrg }
160181254a7Smrg }
161181254a7Smrg if (stack.empty)
162181254a7Smrg break;
163181254a7Smrg start = stack.top[0];
164181254a7Smrg end = stack.top[1];
165181254a7Smrg revPc = stack.top[2];
166181254a7Smrg stack.pop();
167181254a7Smrg }
168181254a7Smrg code[] = rev[];
169181254a7Smrg }
170181254a7Smrg
171181254a7Smrg struct CodeGen
172181254a7Smrg {
173181254a7Smrg Bytecode[] ir; // resulting bytecode
174181254a7Smrg Stack!(uint) fixupStack; // stack of opened start instructions
175181254a7Smrg NamedGroup[] dict; // maps name -> user group number
176181254a7Smrg Stack!(uint) groupStack; // stack of current number of group
177181254a7Smrg uint nesting = 0; // group nesting level and repetitions step
178181254a7Smrg uint lookaroundNest = 0; // nesting of lookaround
179181254a7Smrg uint counterDepth = 0; // current depth of nested counted repetitions
180181254a7Smrg CodepointSet[] charsets; // sets for char classes
181181254a7Smrg const(CharMatcher)[] matchers; // matchers for char classes
182181254a7Smrg uint[] backrefed; // bitarray for groups refered by backref
183181254a7Smrg uint ngroup; // final number of groups (of all patterns)
184181254a7Smrg
startCodeGen185181254a7Smrg void start(uint length)
186181254a7Smrg {
187181254a7Smrg if (!__ctfe)
188181254a7Smrg ir.reserve((length*5+2)/4);
189181254a7Smrg fixupStack.push(0);
190181254a7Smrg groupStack.push(1);//0 - whole match
191181254a7Smrg }
192181254a7Smrg
193181254a7Smrg //mark referenced groups for latter processing
markBackrefCodeGen194181254a7Smrg void markBackref(uint n)
195181254a7Smrg {
196181254a7Smrg if (n/32 >= backrefed.length)
197181254a7Smrg backrefed.length = n/32 + 1;
198181254a7Smrg backrefed[n / 32] |= 1 << (n & 31);
199181254a7Smrg }
200181254a7Smrg
isOpenGroupCodeGen201181254a7Smrg bool isOpenGroup(uint n)
202181254a7Smrg {
203181254a7Smrg import std.algorithm.searching : canFind;
204181254a7Smrg // walk the fixup stack and see if there are groups labeled 'n'
205181254a7Smrg // fixup '0' is reserved for alternations
206181254a7Smrg return fixupStack.data[1..$].
207181254a7Smrg canFind!(fix => ir[fix].code == IR.GroupStart && ir[fix].data == n)();
208181254a7Smrg }
209181254a7Smrg
putCodeGen210181254a7Smrg void put(Bytecode code)
211181254a7Smrg {
212181254a7Smrg enforce(ir.length < maxCompiledLength,
213181254a7Smrg "maximum compiled pattern length is exceeded");
214181254a7Smrg ir ~= code;
215181254a7Smrg }
216181254a7Smrg
putRawCodeGen217181254a7Smrg void putRaw(uint number)
218181254a7Smrg {
219181254a7Smrg enforce(ir.length < maxCompiledLength,
220181254a7Smrg "maximum compiled pattern length is exceeded");
221181254a7Smrg ir ~= Bytecode.fromRaw(number);
222181254a7Smrg }
223181254a7Smrg
224181254a7Smrg //try to generate optimal IR code for this CodepointSet
charsetToIrCodeGen225181254a7Smrg @trusted void charsetToIr(CodepointSet set)
226181254a7Smrg {//@@@BUG@@@ writeln is @system
227181254a7Smrg uint chars = cast(uint) set.length;
228181254a7Smrg if (chars < Bytecode.maxSequence)
229181254a7Smrg {
230181254a7Smrg switch (chars)
231181254a7Smrg {
232181254a7Smrg case 1:
233181254a7Smrg put(Bytecode(IR.Char, set.byCodepoint.front));
234181254a7Smrg break;
235181254a7Smrg case 0:
236181254a7Smrg throw new RegexException("empty CodepointSet not allowed");
237181254a7Smrg default:
238181254a7Smrg foreach (ch; set.byCodepoint)
239181254a7Smrg put(Bytecode(IR.OrChar, ch, chars));
240181254a7Smrg }
241181254a7Smrg }
242181254a7Smrg else
243181254a7Smrg {
244181254a7Smrg import std.algorithm.searching : countUntil;
245181254a7Smrg const ivals = set.byInterval;
246181254a7Smrg immutable n = charsets.countUntil(set);
247181254a7Smrg if (n >= 0)
248181254a7Smrg {
249181254a7Smrg if (ivals.length*2 > maxCharsetUsed)
250181254a7Smrg put(Bytecode(IR.Trie, cast(uint) n));
251181254a7Smrg else
252181254a7Smrg put(Bytecode(IR.CodepointSet, cast(uint) n));
253181254a7Smrg return;
254181254a7Smrg }
255181254a7Smrg if (ivals.length*2 > maxCharsetUsed)
256181254a7Smrg {
257181254a7Smrg auto t = getMatcher(set);
258181254a7Smrg put(Bytecode(IR.Trie, cast(uint) matchers.length));
259181254a7Smrg matchers ~= t;
260181254a7Smrg debug(std_regex_allocation) writeln("Trie generated");
261181254a7Smrg }
262181254a7Smrg else
263181254a7Smrg {
264181254a7Smrg put(Bytecode(IR.CodepointSet, cast(uint) charsets.length));
265181254a7Smrg matchers ~= CharMatcher.init;
266181254a7Smrg }
267181254a7Smrg charsets ~= set;
268181254a7Smrg assert(charsets.length == matchers.length);
269181254a7Smrg }
270181254a7Smrg }
271181254a7Smrg
genLogicGroupCodeGen272181254a7Smrg void genLogicGroup()
273181254a7Smrg {
274181254a7Smrg nesting++;
275181254a7Smrg pushFixup(length);
276181254a7Smrg put(Bytecode(IR.Nop, 0));
277181254a7Smrg }
278181254a7Smrg
genGroupCodeGen279181254a7Smrg void genGroup()
280181254a7Smrg {
281181254a7Smrg nesting++;
282181254a7Smrg pushFixup(length);
283181254a7Smrg immutable nglob = groupStack.top++;
284181254a7Smrg enforce(groupStack.top <= maxGroupNumber, "limit on number of submatches is exceeded");
285181254a7Smrg put(Bytecode(IR.GroupStart, nglob));
286181254a7Smrg }
287181254a7Smrg
genNamedGroupCodeGen288181254a7Smrg void genNamedGroup(string name)
289181254a7Smrg {
290181254a7Smrg import std.array : insertInPlace;
291181254a7Smrg import std.range : assumeSorted;
292181254a7Smrg nesting++;
293181254a7Smrg pushFixup(length);
294181254a7Smrg immutable nglob = groupStack.top++;
295181254a7Smrg enforce(groupStack.top <= maxGroupNumber, "limit on submatches is exceeded");
296181254a7Smrg auto t = NamedGroup(name, nglob);
297181254a7Smrg auto d = assumeSorted!"a.name < b.name"(dict);
298181254a7Smrg immutable ind = d.lowerBound(t).length;
299181254a7Smrg insertInPlace(dict, ind, t);
300181254a7Smrg put(Bytecode(IR.GroupStart, nglob));
301181254a7Smrg }
302181254a7Smrg
303181254a7Smrg //generate code for start of lookaround: (?= (?! (?<= (?<!
genLookaroundCodeGen304181254a7Smrg void genLookaround(IR opcode)
305181254a7Smrg {
306181254a7Smrg nesting++;
307181254a7Smrg pushFixup(length);
308181254a7Smrg put(Bytecode(opcode, 0));
309181254a7Smrg put(Bytecode.fromRaw(0));
310181254a7Smrg put(Bytecode.fromRaw(0));
311181254a7Smrg groupStack.push(0);
312181254a7Smrg lookaroundNest++;
313181254a7Smrg enforce(lookaroundNest <= maxLookaroundDepth,
314181254a7Smrg "maximum lookaround depth is exceeded");
315181254a7Smrg }
316181254a7Smrg
endPatternCodeGen317181254a7Smrg void endPattern(uint num)
318181254a7Smrg {
319181254a7Smrg import std.algorithm.comparison : max;
320181254a7Smrg put(Bytecode(IR.End, num));
321181254a7Smrg ngroup = max(ngroup, groupStack.top);
322181254a7Smrg groupStack.top = 1; // reset group counter
323181254a7Smrg }
324181254a7Smrg
325181254a7Smrg //fixup lookaround with start at offset fix and append a proper *-End opcode
fixLookaroundCodeGen326181254a7Smrg void fixLookaround(uint fix)
327181254a7Smrg {
328181254a7Smrg lookaroundNest--;
329181254a7Smrg ir[fix] = Bytecode(ir[fix].code,
330181254a7Smrg cast(uint) ir.length - fix - IRL!(IR.LookaheadStart));
331181254a7Smrg auto g = groupStack.pop();
332181254a7Smrg assert(!groupStack.empty);
333181254a7Smrg ir[fix+1] = Bytecode.fromRaw(groupStack.top);
334181254a7Smrg //groups are cumulative across lookarounds
335181254a7Smrg ir[fix+2] = Bytecode.fromRaw(groupStack.top+g);
336181254a7Smrg groupStack.top += g;
337181254a7Smrg if (ir[fix].code == IR.LookbehindStart || ir[fix].code == IR.NeglookbehindStart)
338181254a7Smrg {
339181254a7Smrg reverseBytecode(ir[fix + IRL!(IR.LookbehindStart) .. $]);
340181254a7Smrg }
341181254a7Smrg put(ir[fix].paired);
342181254a7Smrg }
343181254a7Smrg
344181254a7Smrg // repetition of {1,1}
fixRepetitionCodeGen345181254a7Smrg void fixRepetition(uint offset)
346181254a7Smrg {
347181254a7Smrg import std.algorithm.mutation : copy;
348181254a7Smrg immutable replace = ir[offset].code == IR.Nop;
349181254a7Smrg if (replace)
350181254a7Smrg {
351181254a7Smrg copy(ir[offset + 1 .. $], ir[offset .. $ - 1]);
352181254a7Smrg ir.length -= 1;
353181254a7Smrg }
354181254a7Smrg }
355181254a7Smrg
356181254a7Smrg // repetition of {x,y}
fixRepetitionCodeGen357181254a7Smrg void fixRepetition(uint offset, uint min, uint max, bool greedy)
358181254a7Smrg {
359181254a7Smrg static import std.algorithm.comparison;
360181254a7Smrg import std.algorithm.mutation : copy;
361181254a7Smrg import std.array : insertInPlace;
362181254a7Smrg immutable replace = ir[offset].code == IR.Nop;
363181254a7Smrg immutable len = cast(uint) ir.length - offset - replace;
364181254a7Smrg if (max != infinite)
365181254a7Smrg {
366181254a7Smrg if (min != 1 || max != 1)
367181254a7Smrg {
368181254a7Smrg Bytecode op = Bytecode(greedy ? IR.RepeatStart : IR.RepeatQStart, len);
369181254a7Smrg if (replace)
370181254a7Smrg ir[offset] = op;
371181254a7Smrg else
372181254a7Smrg insertInPlace(ir, offset, op);
373181254a7Smrg put(Bytecode(greedy ? IR.RepeatEnd : IR.RepeatQEnd, len));
374181254a7Smrg put(Bytecode.init); //hotspot
375181254a7Smrg putRaw(1);
376181254a7Smrg putRaw(min);
377181254a7Smrg putRaw(max);
378181254a7Smrg counterDepth = std.algorithm.comparison.max(counterDepth, nesting+1);
379181254a7Smrg }
380181254a7Smrg }
381181254a7Smrg else if (min) //&& max is infinite
382181254a7Smrg {
383181254a7Smrg if (min != 1)
384181254a7Smrg {
385181254a7Smrg Bytecode op = Bytecode(greedy ? IR.RepeatStart : IR.RepeatQStart, len);
386181254a7Smrg if (replace)
387181254a7Smrg ir[offset] = op;
388181254a7Smrg else
389181254a7Smrg insertInPlace(ir, offset, op);
390181254a7Smrg offset += 1;//so it still points to the repeated block
391181254a7Smrg put(Bytecode(greedy ? IR.RepeatEnd : IR.RepeatQEnd, len));
392181254a7Smrg put(Bytecode.init); //hotspot
393181254a7Smrg putRaw(1);
394181254a7Smrg putRaw(min);
395181254a7Smrg putRaw(min);
396181254a7Smrg counterDepth = std.algorithm.comparison.max(counterDepth, nesting+1);
397181254a7Smrg }
398181254a7Smrg else if (replace)
399181254a7Smrg {
400181254a7Smrg copy(ir[offset+1 .. $], ir[offset .. $-1]);
401181254a7Smrg ir.length -= 1;
402181254a7Smrg }
403181254a7Smrg put(Bytecode(greedy ? IR.InfiniteStart : IR.InfiniteQStart, len));
404181254a7Smrg enforce(ir.length + len < maxCompiledLength, "maximum compiled pattern length is exceeded");
405181254a7Smrg ir ~= ir[offset .. offset+len];
406181254a7Smrg //IR.InfinteX is always a hotspot
407181254a7Smrg put(Bytecode(greedy ? IR.InfiniteEnd : IR.InfiniteQEnd, len));
408181254a7Smrg put(Bytecode.init); //merge index
409181254a7Smrg }
410181254a7Smrg else//vanila {0,inf}
411181254a7Smrg {
412181254a7Smrg Bytecode op = Bytecode(greedy ? IR.InfiniteStart : IR.InfiniteQStart, len);
413181254a7Smrg if (replace)
414181254a7Smrg ir[offset] = op;
415181254a7Smrg else
416181254a7Smrg insertInPlace(ir, offset, op);
417181254a7Smrg //IR.InfinteX is always a hotspot
418181254a7Smrg put(Bytecode(greedy ? IR.InfiniteEnd : IR.InfiniteQEnd, len));
419181254a7Smrg put(Bytecode.init); //merge index
420181254a7Smrg }
421181254a7Smrg }
422181254a7Smrg
fixAlternationCodeGen423181254a7Smrg void fixAlternation()
424181254a7Smrg {
425181254a7Smrg import std.array : insertInPlace;
426181254a7Smrg uint fix = fixupStack.top;
427181254a7Smrg if (ir.length > fix && ir[fix].code == IR.Option)
428181254a7Smrg {
429181254a7Smrg ir[fix] = Bytecode(ir[fix].code, cast(uint) ir.length - fix);
430181254a7Smrg put(Bytecode(IR.GotoEndOr, 0));
431181254a7Smrg fixupStack.top = cast(uint) ir.length; //replace latest fixup for Option
432181254a7Smrg put(Bytecode(IR.Option, 0));
433181254a7Smrg return;
434181254a7Smrg }
435181254a7Smrg uint len, orStart;
436181254a7Smrg //start a new option
437181254a7Smrg if (fixupStack.length == 1)
438181254a7Smrg {//only root entry, effectively no fixup
439181254a7Smrg len = cast(uint) ir.length + IRL!(IR.GotoEndOr);
440181254a7Smrg orStart = 0;
441181254a7Smrg }
442181254a7Smrg else
443181254a7Smrg {//IR.lookahead, etc. fixups that have length > 1, thus check ir[x].length
444181254a7Smrg len = cast(uint) ir.length - fix - (ir[fix].length - 1);
445181254a7Smrg orStart = fix + ir[fix].length;
446181254a7Smrg }
447181254a7Smrg insertInPlace(ir, orStart, Bytecode(IR.OrStart, 0), Bytecode(IR.Option, len));
448181254a7Smrg assert(ir[orStart].code == IR.OrStart);
449181254a7Smrg put(Bytecode(IR.GotoEndOr, 0));
450181254a7Smrg fixupStack.push(orStart); //fixup for StartOR
451181254a7Smrg fixupStack.push(cast(uint) ir.length); //for second Option
452181254a7Smrg put(Bytecode(IR.Option, 0));
453181254a7Smrg }
454181254a7Smrg
455181254a7Smrg // finalizes IR.Option, fix points to the first option of sequence
finishAlternationCodeGen456181254a7Smrg void finishAlternation(uint fix)
457181254a7Smrg {
458181254a7Smrg enforce(ir[fix].code == IR.Option, "no matching ')'");
459181254a7Smrg ir[fix] = Bytecode(ir[fix].code, cast(uint) ir.length - fix - IRL!(IR.OrStart));
460181254a7Smrg fix = fixupStack.pop();
461181254a7Smrg enforce(ir[fix].code == IR.OrStart, "no matching ')'");
462181254a7Smrg ir[fix] = Bytecode(IR.OrStart, cast(uint) ir.length - fix - IRL!(IR.OrStart));
463181254a7Smrg put(Bytecode(IR.OrEnd, cast(uint) ir.length - fix - IRL!(IR.OrStart)));
464181254a7Smrg uint pc = fix + IRL!(IR.OrStart);
465181254a7Smrg while (ir[pc].code == IR.Option)
466181254a7Smrg {
467181254a7Smrg pc = pc + ir[pc].data;
468181254a7Smrg if (ir[pc].code != IR.GotoEndOr)
469181254a7Smrg break;
470181254a7Smrg ir[pc] = Bytecode(IR.GotoEndOr, cast(uint)(ir.length - pc - IRL!(IR.OrEnd)));
471181254a7Smrg pc += IRL!(IR.GotoEndOr);
472181254a7Smrg }
473181254a7Smrg put(Bytecode.fromRaw(0));
474181254a7Smrg }
475181254a7Smrg
476181254a7Smrg // returns: (flag - repetition possible?, fixup of the start of this "group")
477181254a7Smrg Tuple!(bool, uint) onClose()
478181254a7Smrg {
479181254a7Smrg nesting--;
480181254a7Smrg uint fix = popFixup();
481181254a7Smrg switch (ir[fix].code)
482181254a7Smrg {
483181254a7Smrg case IR.GroupStart:
484181254a7Smrg put(Bytecode(IR.GroupEnd, ir[fix].data));
485181254a7Smrg return tuple(true, fix);
486181254a7Smrg case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
487181254a7Smrg assert(lookaroundNest);
488181254a7Smrg fixLookaround(fix);
489181254a7Smrg return tuple(false, 0u);
490181254a7Smrg case IR.Option: //| xxx )
491181254a7Smrg //two fixups: last option + full OR
492181254a7Smrg finishAlternation(fix);
493181254a7Smrg fix = topFixup;
494181254a7Smrg switch (ir[fix].code)
495181254a7Smrg {
496181254a7Smrg case IR.GroupStart:
497181254a7Smrg popFixup();
498181254a7Smrg put(Bytecode(IR.GroupEnd, ir[fix].data));
499181254a7Smrg return tuple(true, fix);
500181254a7Smrg case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
501181254a7Smrg assert(lookaroundNest);
502181254a7Smrg fix = popFixup();
503181254a7Smrg fixLookaround(fix);
504181254a7Smrg return tuple(false, 0u);
505181254a7Smrg default://(?:xxx)
506181254a7Smrg popFixup();
507181254a7Smrg return tuple(true, fix);
508181254a7Smrg }
509181254a7Smrg default://(?:xxx)
510181254a7Smrg return tuple(true, fix);
511181254a7Smrg }
512181254a7Smrg }
513181254a7Smrg
popFixupCodeGen514181254a7Smrg uint popFixup(){ return fixupStack.pop(); }
515181254a7Smrg
pushFixupCodeGen516181254a7Smrg void pushFixup(uint val){ return fixupStack.push(val); }
517181254a7Smrg
topFixupCodeGen518181254a7Smrg @property uint topFixup(){ return fixupStack.top; }
519181254a7Smrg
fixupLengthCodeGen520181254a7Smrg @property size_t fixupLength(){ return fixupStack.data.length; }
521181254a7Smrg
lengthCodeGen522181254a7Smrg @property uint length(){ return cast(uint) ir.length; }
523181254a7Smrg }
524181254a7Smrg
525181254a7Smrg // safety limits
526181254a7Smrg enum maxGroupNumber = 2^^19;
527181254a7Smrg enum maxLookaroundDepth = 16;
528181254a7Smrg // *Bytecode.sizeof, i.e. 1Mb of bytecode alone
529181254a7Smrg enum maxCompiledLength = 2^^18;
530181254a7Smrg // amounts to up to 4 Mb of auxilary table for matching
531181254a7Smrg enum maxCumulativeRepetitionLength = 2^^20;
532181254a7Smrg // marker to indicate infinite repetition
533181254a7Smrg enum infinite = ~0u;
534181254a7Smrg
535181254a7Smrg struct Parser(R, Generator)
536181254a7Smrg if (isForwardRange!R && is(ElementType!R : dchar))
537181254a7Smrg {
538*b1e83836Smrg dchar front;
539181254a7Smrg bool empty;
540181254a7Smrg R pat, origin; //keep full pattern for pretty printing error messages
541181254a7Smrg uint re_flags = 0; //global flags e.g. multiline + internal ones
542181254a7Smrg Generator g;
543181254a7Smrg
544181254a7Smrg @trusted this(S)(R pattern, S flags)
545181254a7Smrg if (isSomeString!S)
546181254a7Smrg {
547181254a7Smrg pat = origin = pattern;
548181254a7Smrg //reserve slightly more then avg as sampled from unittests
549181254a7Smrg parseFlags(flags);
550*b1e83836Smrg front = ' ';//a safe default for freeform parsing
551*b1e83836Smrg popFront();
552181254a7Smrg g.start(cast(uint) pat.length);
553181254a7Smrg try
554181254a7Smrg {
555181254a7Smrg parseRegex();
556181254a7Smrg }
catch(Exception e)557181254a7Smrg catch (Exception e)
558181254a7Smrg {
559181254a7Smrg error(e.msg);//also adds pattern location
560181254a7Smrg }
561181254a7Smrg g.endPattern(1);
562181254a7Smrg }
563181254a7Smrg
_popFront()564*b1e83836Smrg void _popFront()
565181254a7Smrg {
566181254a7Smrg if (pat.empty)
567181254a7Smrg {
568181254a7Smrg empty = true;
569181254a7Smrg }
570*b1e83836Smrg else
571*b1e83836Smrg {
572*b1e83836Smrg front = pat.front;
573181254a7Smrg pat.popFront();
574*b1e83836Smrg }
575181254a7Smrg }
576181254a7Smrg
skipSpace()577181254a7Smrg void skipSpace()
578181254a7Smrg {
579*b1e83836Smrg while (!empty && isWhite(front)) _popFront();
580181254a7Smrg }
581181254a7Smrg
popFront()582*b1e83836Smrg void popFront()
583181254a7Smrg {
584*b1e83836Smrg _popFront();
585*b1e83836Smrg if (re_flags & RegexOption.freeform) skipSpace();
586181254a7Smrg }
587*b1e83836Smrg
save()588*b1e83836Smrg auto save(){ return this; }
589181254a7Smrg
590181254a7Smrg //parsing number with basic overflow check
parseDecimal()591181254a7Smrg uint parseDecimal()
592181254a7Smrg {
593181254a7Smrg uint r = 0;
594*b1e83836Smrg while (std.ascii.isDigit(front))
595181254a7Smrg {
596181254a7Smrg if (r >= (uint.max/10))
597181254a7Smrg error("Overflow in decimal number");
598*b1e83836Smrg r = 10*r + cast(uint)(front-'0');
599*b1e83836Smrg popFront();
600*b1e83836Smrg if (empty) break;
601181254a7Smrg }
602181254a7Smrg return r;
603181254a7Smrg }
604181254a7Smrg
605181254a7Smrg //
parseFlags(S)606181254a7Smrg @trusted void parseFlags(S)(S flags)
607181254a7Smrg {//@@@BUG@@@ text is @system
608181254a7Smrg import std.conv : text;
609181254a7Smrg foreach (ch; flags)//flags are ASCII anyway
610181254a7Smrg {
611181254a7Smrg L_FlagSwitch:
612181254a7Smrg switch (ch)
613181254a7Smrg {
614181254a7Smrg
615181254a7Smrg foreach (i, op; __traits(allMembers, RegexOption))
616181254a7Smrg {
617181254a7Smrg case RegexOptionNames[i]:
618181254a7Smrg if (re_flags & mixin("RegexOption."~op))
619181254a7Smrg throw new RegexException(text("redundant flag specified: ",ch));
620181254a7Smrg re_flags |= mixin("RegexOption."~op);
621181254a7Smrg break L_FlagSwitch;
622181254a7Smrg }
623181254a7Smrg default:
624181254a7Smrg throw new RegexException(text("unknown regex flag '",ch,"'"));
625181254a7Smrg }
626181254a7Smrg }
627181254a7Smrg }
628181254a7Smrg
629181254a7Smrg //parse and store IR for regex pattern
parseRegex()630181254a7Smrg @trusted void parseRegex()
631181254a7Smrg {
632181254a7Smrg uint fix;//fixup pointer
633181254a7Smrg
634181254a7Smrg while (!empty)
635181254a7Smrg {
636181254a7Smrg debug(std_regex_parser)
637181254a7Smrg __ctfe || writeln("*LR*\nSource: ", pat, "\nStack: ",fixupStack.data);
638*b1e83836Smrg switch (front)
639181254a7Smrg {
640181254a7Smrg case '(':
641*b1e83836Smrg popFront();
642*b1e83836Smrg if (front == '?')
643181254a7Smrg {
644*b1e83836Smrg popFront();
645*b1e83836Smrg switch (front)
646181254a7Smrg {
647181254a7Smrg case '#':
648181254a7Smrg for (;;)
649181254a7Smrg {
650*b1e83836Smrg popFront();
651*b1e83836Smrg enforce(!empty, "Unexpected end of pattern");
652*b1e83836Smrg if (front == ')')
653181254a7Smrg {
654*b1e83836Smrg popFront();
655181254a7Smrg break;
656181254a7Smrg }
657181254a7Smrg }
658181254a7Smrg break;
659181254a7Smrg case ':':
660181254a7Smrg g.genLogicGroup();
661*b1e83836Smrg popFront();
662181254a7Smrg break;
663181254a7Smrg case '=':
664181254a7Smrg g.genLookaround(IR.LookaheadStart);
665*b1e83836Smrg popFront();
666181254a7Smrg break;
667181254a7Smrg case '!':
668181254a7Smrg g.genLookaround(IR.NeglookaheadStart);
669*b1e83836Smrg popFront();
670181254a7Smrg break;
671181254a7Smrg case 'P':
672*b1e83836Smrg popFront();
673*b1e83836Smrg enforce(front == '<', "Expected '<' in named group");
674181254a7Smrg string name;
675*b1e83836Smrg popFront();
676*b1e83836Smrg if (empty || !(isAlpha(front) || front == '_'))
677181254a7Smrg error("Expected alpha starting a named group");
678*b1e83836Smrg name ~= front;
679*b1e83836Smrg popFront();
680*b1e83836Smrg while (!empty && (isAlpha(front) ||
681*b1e83836Smrg front == '_' || std.ascii.isDigit(front)))
682181254a7Smrg {
683*b1e83836Smrg name ~= front;
684*b1e83836Smrg popFront();
685181254a7Smrg }
686*b1e83836Smrg enforce(front == '>', "Expected '>' closing named group");
687*b1e83836Smrg popFront();
688181254a7Smrg g.genNamedGroup(name);
689181254a7Smrg break;
690181254a7Smrg case '<':
691*b1e83836Smrg popFront();
692*b1e83836Smrg if (front == '=')
693181254a7Smrg g.genLookaround(IR.LookbehindStart);
694*b1e83836Smrg else if (front == '!')
695181254a7Smrg g.genLookaround(IR.NeglookbehindStart);
696181254a7Smrg else
697181254a7Smrg error("'!' or '=' expected after '<'");
698*b1e83836Smrg popFront();
699181254a7Smrg break;
700181254a7Smrg default:
701181254a7Smrg uint enableFlags, disableFlags;
702181254a7Smrg bool enable = true;
703181254a7Smrg do
704181254a7Smrg {
705*b1e83836Smrg switch (front)
706181254a7Smrg {
707181254a7Smrg case 's':
708181254a7Smrg if (enable)
709181254a7Smrg enableFlags |= RegexOption.singleline;
710181254a7Smrg else
711181254a7Smrg disableFlags |= RegexOption.singleline;
712181254a7Smrg break;
713181254a7Smrg case 'x':
714181254a7Smrg if (enable)
715181254a7Smrg enableFlags |= RegexOption.freeform;
716181254a7Smrg else
717181254a7Smrg disableFlags |= RegexOption.freeform;
718181254a7Smrg break;
719181254a7Smrg case 'i':
720181254a7Smrg if (enable)
721181254a7Smrg enableFlags |= RegexOption.casefold;
722181254a7Smrg else
723181254a7Smrg disableFlags |= RegexOption.casefold;
724181254a7Smrg break;
725181254a7Smrg case 'm':
726181254a7Smrg if (enable)
727181254a7Smrg enableFlags |= RegexOption.multiline;
728181254a7Smrg else
729181254a7Smrg disableFlags |= RegexOption.multiline;
730181254a7Smrg break;
731181254a7Smrg case '-':
732181254a7Smrg if (!enable)
733181254a7Smrg error(" unexpected second '-' in flags");
734181254a7Smrg enable = false;
735181254a7Smrg break;
736181254a7Smrg default:
737181254a7Smrg error(" 's', 'x', 'i', 'm' or '-' expected after '(?' ");
738181254a7Smrg }
739*b1e83836Smrg popFront();
740*b1e83836Smrg }while (front != ')');
741*b1e83836Smrg popFront();
742181254a7Smrg re_flags |= enableFlags;
743181254a7Smrg re_flags &= ~disableFlags;
744181254a7Smrg }
745181254a7Smrg }
746181254a7Smrg else
747181254a7Smrg {
748181254a7Smrg g.genGroup();
749181254a7Smrg }
750181254a7Smrg break;
751181254a7Smrg case ')':
752181254a7Smrg enforce(g.nesting, "Unmatched ')'");
753*b1e83836Smrg popFront();
754181254a7Smrg auto pair = g.onClose();
755181254a7Smrg if (pair[0])
756181254a7Smrg parseQuantifier(pair[1]);
757181254a7Smrg break;
758181254a7Smrg case '|':
759*b1e83836Smrg popFront();
760181254a7Smrg g.fixAlternation();
761181254a7Smrg break;
762181254a7Smrg default://no groups or whatever
763181254a7Smrg immutable start = g.length;
764181254a7Smrg parseAtom();
765181254a7Smrg parseQuantifier(start);
766181254a7Smrg }
767181254a7Smrg }
768181254a7Smrg
769181254a7Smrg if (g.fixupLength != 1)
770181254a7Smrg {
771181254a7Smrg fix = g.popFixup();
772181254a7Smrg g.finishAlternation(fix);
773181254a7Smrg enforce(g.fixupLength == 1, "no matching ')'");
774181254a7Smrg }
775181254a7Smrg }
776181254a7Smrg
777181254a7Smrg
778181254a7Smrg //parse and store IR for atom-quantifier pair
parseQuantifier(uint offset)779181254a7Smrg @trusted void parseQuantifier(uint offset)
780181254a7Smrg {//copy is @system
781181254a7Smrg if (empty)
782181254a7Smrg return g.fixRepetition(offset);
783181254a7Smrg uint min, max;
784*b1e83836Smrg switch (front)
785181254a7Smrg {
786181254a7Smrg case '*':
787181254a7Smrg min = 0;
788181254a7Smrg max = infinite;
789181254a7Smrg break;
790181254a7Smrg case '?':
791181254a7Smrg min = 0;
792181254a7Smrg max = 1;
793181254a7Smrg break;
794181254a7Smrg case '+':
795181254a7Smrg min = 1;
796181254a7Smrg max = infinite;
797181254a7Smrg break;
798181254a7Smrg case '{':
799*b1e83836Smrg popFront();
800*b1e83836Smrg enforce(!empty, "Unexpected end of regex pattern");
801*b1e83836Smrg enforce(std.ascii.isDigit(front), "First number required in repetition");
802181254a7Smrg min = parseDecimal();
803*b1e83836Smrg if (front == '}')
804181254a7Smrg max = min;
805*b1e83836Smrg else if (front == ',')
806181254a7Smrg {
807*b1e83836Smrg popFront();
808*b1e83836Smrg if (std.ascii.isDigit(front))
809181254a7Smrg max = parseDecimal();
810*b1e83836Smrg else if (front == '}')
811181254a7Smrg max = infinite;
812181254a7Smrg else
813181254a7Smrg error("Unexpected symbol in regex pattern");
814181254a7Smrg skipSpace();
815*b1e83836Smrg enforce(front == '}', "Unmatched '{' in regex pattern");
816181254a7Smrg }
817181254a7Smrg else
818181254a7Smrg error("Unexpected symbol in regex pattern");
819*b1e83836Smrg enforce(min <= max, "Illegal {n,m} quantifier");
820181254a7Smrg break;
821181254a7Smrg default:
822181254a7Smrg g.fixRepetition(offset);
823181254a7Smrg return;
824181254a7Smrg }
825181254a7Smrg bool greedy = true;
826181254a7Smrg //check only if we managed to get new symbol
827*b1e83836Smrg popFront();
828*b1e83836Smrg if (!empty && front == '?')
829181254a7Smrg {
830181254a7Smrg greedy = false;
831*b1e83836Smrg popFront();
832181254a7Smrg }
833181254a7Smrg g.fixRepetition(offset, min, max, greedy);
834181254a7Smrg }
835181254a7Smrg
836181254a7Smrg //parse and store IR for atom
parseAtom()837181254a7Smrg void parseAtom()
838181254a7Smrg {
839181254a7Smrg if (empty)
840181254a7Smrg return;
841*b1e83836Smrg switch (front)
842181254a7Smrg {
843181254a7Smrg case '*', '?', '+', '|', '{', '}':
844*b1e83836Smrg return error("'*', '+', '?', '{', '}' not allowed in atom");
845181254a7Smrg case '.':
846181254a7Smrg if (re_flags & RegexOption.singleline)
847181254a7Smrg g.put(Bytecode(IR.Any, 0));
848181254a7Smrg else
849181254a7Smrg {
850181254a7Smrg CodepointSet set;
851181254a7Smrg g.charsetToIr(set.add('\n','\n'+1).add('\r', '\r'+1).inverted);
852181254a7Smrg }
853*b1e83836Smrg popFront();
854181254a7Smrg break;
855181254a7Smrg case '[':
856181254a7Smrg parseCharset();
857181254a7Smrg break;
858181254a7Smrg case '\\':
859*b1e83836Smrg _popFront();
860*b1e83836Smrg enforce(!empty, "Unfinished escape sequence");
861181254a7Smrg parseEscape();
862181254a7Smrg break;
863181254a7Smrg case '^':
864181254a7Smrg if (re_flags & RegexOption.multiline)
865181254a7Smrg g.put(Bytecode(IR.Bol, 0));
866181254a7Smrg else
867181254a7Smrg g.put(Bytecode(IR.Bof, 0));
868*b1e83836Smrg popFront();
869181254a7Smrg break;
870181254a7Smrg case '$':
871181254a7Smrg if (re_flags & RegexOption.multiline)
872181254a7Smrg g.put(Bytecode(IR.Eol, 0));
873181254a7Smrg else
874181254a7Smrg g.put(Bytecode(IR.Eof, 0));
875*b1e83836Smrg popFront();
876181254a7Smrg break;
877181254a7Smrg default:
878181254a7Smrg if (re_flags & RegexOption.casefold)
879181254a7Smrg {
880*b1e83836Smrg auto range = simpleCaseFoldings(front);
881181254a7Smrg assert(range.length <= 5);
882181254a7Smrg if (range.length == 1)
883181254a7Smrg g.put(Bytecode(IR.Char, range.front));
884181254a7Smrg else
885181254a7Smrg foreach (v; range)
886181254a7Smrg g.put(Bytecode(IR.OrChar, v, cast(uint) range.length));
887181254a7Smrg }
888181254a7Smrg else
889*b1e83836Smrg g.put(Bytecode(IR.Char, front));
890*b1e83836Smrg popFront();
891181254a7Smrg }
892181254a7Smrg }
893181254a7Smrg
894181254a7Smrg //parse and store IR for CodepointSet
parseCharset()895181254a7Smrg void parseCharset()
896181254a7Smrg {
897181254a7Smrg const save = re_flags;
898181254a7Smrg re_flags &= ~RegexOption.freeform; // stop ignoring whitespace if we did
899*b1e83836Smrg bool casefold = cast(bool)(re_flags & RegexOption.casefold);
900*b1e83836Smrg g.charsetToIr(unicode.parseSet(this, casefold));
901181254a7Smrg re_flags = save;
902*b1e83836Smrg // Last next() in parseCharset is executed w/o freeform flag
903181254a7Smrg if (re_flags & RegexOption.freeform) skipSpace();
904181254a7Smrg }
905181254a7Smrg
906181254a7Smrg //parse and generate IR for escape stand alone escape sequence
parseEscape()907181254a7Smrg @trusted void parseEscape()
908181254a7Smrg {//accesses array of appender
909181254a7Smrg import std.algorithm.iteration : sum;
910*b1e83836Smrg switch (front)
911181254a7Smrg {
912*b1e83836Smrg case 'f': popFront(); g.put(Bytecode(IR.Char, '\f')); break;
913*b1e83836Smrg case 'n': popFront(); g.put(Bytecode(IR.Char, '\n')); break;
914*b1e83836Smrg case 'r': popFront(); g.put(Bytecode(IR.Char, '\r')); break;
915*b1e83836Smrg case 't': popFront(); g.put(Bytecode(IR.Char, '\t')); break;
916*b1e83836Smrg case 'v': popFront(); g.put(Bytecode(IR.Char, '\v')); break;
917181254a7Smrg
918181254a7Smrg case 'd':
919*b1e83836Smrg popFront();
920181254a7Smrg g.charsetToIr(unicode.Nd);
921181254a7Smrg break;
922181254a7Smrg case 'D':
923*b1e83836Smrg popFront();
924181254a7Smrg g.charsetToIr(unicode.Nd.inverted);
925181254a7Smrg break;
926*b1e83836Smrg case 'b': popFront(); g.put(Bytecode(IR.Wordboundary, 0)); break;
927*b1e83836Smrg case 'B': popFront(); g.put(Bytecode(IR.Notwordboundary, 0)); break;
928181254a7Smrg case 's':
929*b1e83836Smrg popFront();
930181254a7Smrg g.charsetToIr(unicode.White_Space);
931181254a7Smrg break;
932181254a7Smrg case 'S':
933*b1e83836Smrg popFront();
934181254a7Smrg g.charsetToIr(unicode.White_Space.inverted);
935181254a7Smrg break;
936181254a7Smrg case 'w':
937*b1e83836Smrg popFront();
938181254a7Smrg g.charsetToIr(wordCharacter);
939181254a7Smrg break;
940181254a7Smrg case 'W':
941*b1e83836Smrg popFront();
942181254a7Smrg g.charsetToIr(wordCharacter.inverted);
943181254a7Smrg break;
944181254a7Smrg case 'p': case 'P':
945*b1e83836Smrg bool casefold = cast(bool)(re_flags & RegexOption.casefold);
946*b1e83836Smrg auto set = unicode.parsePropertySpec(this, front == 'P', casefold);
947*b1e83836Smrg g.charsetToIr(set);
948181254a7Smrg break;
949181254a7Smrg case 'x':
950181254a7Smrg immutable code = parseUniHex(pat, 2);
951*b1e83836Smrg popFront();
952181254a7Smrg g.put(Bytecode(IR.Char,code));
953181254a7Smrg break;
954181254a7Smrg case 'u': case 'U':
955*b1e83836Smrg immutable code = parseUniHex(pat, front == 'u' ? 4 : 8);
956*b1e83836Smrg popFront();
957181254a7Smrg g.put(Bytecode(IR.Char, code));
958181254a7Smrg break;
959181254a7Smrg case 'c': //control codes
960*b1e83836Smrg Bytecode code = Bytecode(IR.Char, unicode.parseControlCode(this));
961*b1e83836Smrg popFront();
962181254a7Smrg g.put(code);
963181254a7Smrg break;
964181254a7Smrg case '0':
965*b1e83836Smrg popFront();
966181254a7Smrg g.put(Bytecode(IR.Char, 0));//NUL character
967181254a7Smrg break;
968181254a7Smrg case '1': .. case '9':
969*b1e83836Smrg uint nref = cast(uint) front - '0';
970181254a7Smrg immutable maxBackref = sum(g.groupStack.data);
971181254a7Smrg enforce(nref < maxBackref, "Backref to unseen group");
972181254a7Smrg //perl's disambiguation rule i.e.
973181254a7Smrg //get next digit only if there is such group number
974*b1e83836Smrg popFront();
975*b1e83836Smrg while (nref < maxBackref && !empty && std.ascii.isDigit(front))
976181254a7Smrg {
977*b1e83836Smrg nref = nref * 10 + front - '0';
978*b1e83836Smrg popFront();
979181254a7Smrg }
980181254a7Smrg if (nref >= maxBackref)
981181254a7Smrg nref /= 10;
982181254a7Smrg enforce(!g.isOpenGroup(nref), "Backref to open group");
983181254a7Smrg uint localLimit = maxBackref - g.groupStack.top;
984181254a7Smrg if (nref >= localLimit)
985181254a7Smrg {
986181254a7Smrg g.put(Bytecode(IR.Backref, nref-localLimit));
987181254a7Smrg g.ir[$-1].setLocalRef();
988181254a7Smrg }
989181254a7Smrg else
990181254a7Smrg g.put(Bytecode(IR.Backref, nref));
991181254a7Smrg g.markBackref(nref);
992181254a7Smrg break;
993181254a7Smrg default:
994*b1e83836Smrg if (front == '\\' && !pat.empty)
995181254a7Smrg {
996*b1e83836Smrg if (pat.front >= privateUseStart && pat.front <= privateUseEnd)
997181254a7Smrg enforce(false, "invalid escape sequence");
998181254a7Smrg }
999*b1e83836Smrg if (front >= privateUseStart && front <= privateUseEnd)
1000181254a7Smrg {
1001*b1e83836Smrg g.endPattern(front - privateUseStart + 1);
1002181254a7Smrg break;
1003181254a7Smrg }
1004*b1e83836Smrg auto op = Bytecode(IR.Char, front);
1005*b1e83836Smrg popFront();
1006181254a7Smrg g.put(op);
1007181254a7Smrg }
1008181254a7Smrg }
1009181254a7Smrg
1010181254a7Smrg //
error(string msg)1011181254a7Smrg @trusted void error(string msg)
1012181254a7Smrg {
1013181254a7Smrg import std.array : appender;
1014*b1e83836Smrg import std.format.write : formattedWrite;
1015181254a7Smrg auto app = appender!string();
1016181254a7Smrg formattedWrite(app, "%s\nPattern with error: `%s` <--HERE-- `%s`",
1017181254a7Smrg msg, origin[0..$-pat.length], pat);
1018181254a7Smrg throw new RegexException(app.data);
1019181254a7Smrg }
1020181254a7Smrg
1021181254a7Smrg alias Char = BasicElementOf!R;
1022181254a7Smrg
program()1023181254a7Smrg @property program()
1024181254a7Smrg {
1025181254a7Smrg return makeRegex(this);
1026181254a7Smrg }
1027181254a7Smrg }
1028181254a7Smrg
1029181254a7Smrg /+
1030181254a7Smrg Postproces the IR, then optimize.
1031181254a7Smrg +/
postprocess(Char)1032181254a7Smrg @trusted void postprocess(Char)(ref Regex!Char zis)
1033181254a7Smrg {//@@@BUG@@@ write is @system
1034181254a7Smrg with(zis)
1035181254a7Smrg {
1036181254a7Smrg struct FixedStack(T)
1037181254a7Smrg {
1038181254a7Smrg T[] arr;
1039181254a7Smrg uint _top;
1040181254a7Smrg //this(T[] storage){ arr = storage; _top = -1; }
1041181254a7Smrg @property ref T top(){ assert(!empty); return arr[_top]; }
1042181254a7Smrg void push(T x){ arr[++_top] = x; }
1043181254a7Smrg T pop() { assert(!empty); return arr[_top--]; }
1044181254a7Smrg @property bool empty(){ return _top == -1; }
1045181254a7Smrg }
1046181254a7Smrg auto counterRange = FixedStack!uint(new uint[maxCounterDepth+1], -1);
1047181254a7Smrg counterRange.push(1);
1048181254a7Smrg ulong cumRange = 0;
1049181254a7Smrg for (uint i = 0; i < ir.length; i += ir[i].length)
1050181254a7Smrg {
1051181254a7Smrg if (ir[i].hotspot)
1052181254a7Smrg {
1053181254a7Smrg assert(i + 1 < ir.length,
1054181254a7Smrg "unexpected end of IR while looking for hotspot");
1055181254a7Smrg ir[i+1] = Bytecode.fromRaw(hotspotTableSize);
1056181254a7Smrg hotspotTableSize += counterRange.top;
1057181254a7Smrg }
1058181254a7Smrg switch (ir[i].code)
1059181254a7Smrg {
1060181254a7Smrg case IR.RepeatStart, IR.RepeatQStart:
1061181254a7Smrg uint repEnd = cast(uint)(i + ir[i].data + IRL!(IR.RepeatStart));
1062181254a7Smrg assert(ir[repEnd].code == ir[i].paired.code);
1063181254a7Smrg immutable max = ir[repEnd + 4].raw;
1064181254a7Smrg ir[repEnd+2].raw = counterRange.top;
1065181254a7Smrg ir[repEnd+3].raw *= counterRange.top;
1066181254a7Smrg ir[repEnd+4].raw *= counterRange.top;
1067181254a7Smrg ulong cntRange = cast(ulong)(max)*counterRange.top;
1068181254a7Smrg cumRange += cntRange;
1069181254a7Smrg enforce(cumRange < maxCumulativeRepetitionLength,
1070181254a7Smrg "repetition length limit is exceeded");
1071181254a7Smrg counterRange.push(cast(uint) cntRange + counterRange.top);
1072181254a7Smrg threadCount += counterRange.top;
1073181254a7Smrg break;
1074181254a7Smrg case IR.RepeatEnd, IR.RepeatQEnd:
1075181254a7Smrg threadCount += counterRange.top;
1076181254a7Smrg counterRange.pop();
1077181254a7Smrg break;
1078181254a7Smrg case IR.GroupStart:
1079181254a7Smrg if (isBackref(ir[i].data))
1080181254a7Smrg ir[i].setBackrefence();
1081181254a7Smrg threadCount += counterRange.top;
1082181254a7Smrg break;
1083181254a7Smrg case IR.GroupEnd:
1084181254a7Smrg if (isBackref(ir[i].data))
1085181254a7Smrg ir[i].setBackrefence();
1086181254a7Smrg threadCount += counterRange.top;
1087181254a7Smrg break;
1088181254a7Smrg default:
1089181254a7Smrg threadCount += counterRange.top;
1090181254a7Smrg }
1091181254a7Smrg }
1092181254a7Smrg checkIfOneShot();
1093181254a7Smrg if (!(flags & RegexInfo.oneShot))
1094181254a7Smrg kickstart = Kickstart!Char(zis, new uint[](256));
1095181254a7Smrg debug(std_regex_allocation) writefln("IR processed, max threads: %d", threadCount);
1096181254a7Smrg optimize(zis);
1097181254a7Smrg }
1098181254a7Smrg }
1099181254a7Smrg
fixupBytecode()1100181254a7Smrg void fixupBytecode()(Bytecode[] ir)
1101181254a7Smrg {
1102181254a7Smrg Stack!uint fixups;
1103181254a7Smrg
1104181254a7Smrg with(IR) for (uint i=0; i<ir.length; i+= ir[i].length)
1105181254a7Smrg {
1106181254a7Smrg if (ir[i].isStart || ir[i].code == Option)
1107181254a7Smrg fixups.push(i);
1108181254a7Smrg else if (ir[i].code == OrEnd)
1109181254a7Smrg {
1110181254a7Smrg // Alternatives need more care
1111181254a7Smrg auto j = fixups.pop(); // last Option
1112181254a7Smrg ir[j].data = i - j - ir[j].length;
1113181254a7Smrg j = fixups.pop(); // OrStart
1114181254a7Smrg ir[j].data = i - j - ir[j].length;
1115181254a7Smrg ir[i].data = ir[j].data;
1116181254a7Smrg
1117181254a7Smrg // fixup all GotoEndOrs
1118181254a7Smrg j = j + IRL!(OrStart);
1119181254a7Smrg assert(ir[j].code == Option);
1120181254a7Smrg for (;;)
1121181254a7Smrg {
1122181254a7Smrg auto next = j + ir[j].data + IRL!(Option);
1123181254a7Smrg if (ir[next].code == IR.OrEnd)
1124181254a7Smrg break;
1125181254a7Smrg ir[next - IRL!(GotoEndOr)].data = i - next;
1126181254a7Smrg j = next;
1127181254a7Smrg }
1128181254a7Smrg }
1129181254a7Smrg else if (ir[i].code == GotoEndOr)
1130181254a7Smrg {
1131181254a7Smrg auto j = fixups.pop(); // Option
1132181254a7Smrg ir[j].data = i - j + IRL!(GotoEndOr)- IRL!(Option); // to the next option
1133181254a7Smrg }
1134181254a7Smrg else if (ir[i].isEnd)
1135181254a7Smrg {
1136181254a7Smrg auto j = fixups.pop();
1137181254a7Smrg ir[i].data = i - j - ir[j].length;
1138181254a7Smrg ir[j].data = ir[i].data;
1139181254a7Smrg }
1140181254a7Smrg }
1141181254a7Smrg assert(fixups.empty);
1142181254a7Smrg }
1143181254a7Smrg
optimize(Char)1144181254a7Smrg void optimize(Char)(ref Regex!Char zis)
1145181254a7Smrg {
1146181254a7Smrg import std.array : insertInPlace;
1147181254a7Smrg CodepointSet nextSet(uint idx)
1148181254a7Smrg {
1149181254a7Smrg CodepointSet set;
1150181254a7Smrg with(zis) with(IR)
1151181254a7Smrg Outer:
1152181254a7Smrg for (uint i = idx; i < ir.length; i += ir[i].length)
1153181254a7Smrg {
1154181254a7Smrg switch (ir[i].code)
1155181254a7Smrg {
1156181254a7Smrg case Char:
1157181254a7Smrg set.add(ir[i].data, ir[i].data+1);
1158181254a7Smrg goto default;
1159181254a7Smrg //TODO: OrChar
1160181254a7Smrg case Trie, CodepointSet:
1161181254a7Smrg set = zis.charsets[ir[i].data];
1162181254a7Smrg goto default;
1163181254a7Smrg case GroupStart,GroupEnd:
1164181254a7Smrg break;
1165181254a7Smrg default:
1166181254a7Smrg break Outer;
1167181254a7Smrg }
1168181254a7Smrg }
1169181254a7Smrg return set;
1170181254a7Smrg }
1171181254a7Smrg
1172181254a7Smrg with(zis) with(IR) for (uint i = 0; i < ir.length; i += ir[i].length)
1173181254a7Smrg {
1174181254a7Smrg if (ir[i].code == InfiniteEnd)
1175181254a7Smrg {
1176181254a7Smrg auto set = nextSet(i+IRL!(InfiniteEnd));
1177181254a7Smrg if (!set.empty && set.length < 10_000)
1178181254a7Smrg {
1179181254a7Smrg ir[i] = Bytecode(InfiniteBloomEnd, ir[i].data);
1180181254a7Smrg ir[i - ir[i].data - IRL!(InfiniteStart)] =
1181181254a7Smrg Bytecode(InfiniteBloomStart, ir[i].data);
1182181254a7Smrg ir.insertInPlace(i+IRL!(InfiniteEnd),
1183181254a7Smrg Bytecode.fromRaw(cast(uint) zis.filters.length));
1184181254a7Smrg zis.filters ~= BitTable(set);
1185181254a7Smrg fixupBytecode(ir);
1186181254a7Smrg }
1187181254a7Smrg }
1188181254a7Smrg }
1189181254a7Smrg }
1190181254a7Smrg
1191181254a7Smrg //IR code validator - proper nesting, illegal instructions, etc.
validateRe(Char)1192181254a7Smrg @trusted void validateRe(Char)(ref Regex!Char zis)
1193181254a7Smrg {//@@@BUG@@@ text is @system
1194181254a7Smrg import std.conv : text;
1195181254a7Smrg with(zis)
1196181254a7Smrg {
1197181254a7Smrg for (uint pc = 0; pc < ir.length; pc += ir[pc].length)
1198181254a7Smrg {
1199181254a7Smrg if (ir[pc].isStart || ir[pc].isEnd)
1200181254a7Smrg {
1201181254a7Smrg immutable dest = ir[pc].indexOfPair(pc);
1202181254a7Smrg assert(dest < ir.length, text("Wrong length in opcode at pc=",
1203181254a7Smrg pc, " ", dest, " vs ", ir.length));
1204181254a7Smrg assert(ir[dest].paired == ir[pc],
1205181254a7Smrg text("Wrong pairing of opcodes at pc=", pc, "and pc=", dest));
1206181254a7Smrg }
1207181254a7Smrg else if (ir[pc].isAtom)
1208181254a7Smrg {
1209181254a7Smrg
1210181254a7Smrg }
1211181254a7Smrg else
1212181254a7Smrg assert(0, text("Unknown type of instruction at pc=", pc));
1213181254a7Smrg }
1214181254a7Smrg }
1215181254a7Smrg }
1216