xref: /netbsd-src/external/gpl3/gcc/dist/libphobos/src/std/regex/internal/parser.d (revision b1e838363e3c6fc78a55519254d99869742dd33c)
1181254a7Smrg //Written in the D programming language
2181254a7Smrg /*
3181254a7Smrg     Regular expression pattern parser.
4181254a7Smrg */
5181254a7Smrg module std.regex.internal.parser;
6181254a7Smrg 
7*b1e83836Smrg import std.regex.internal.ir;
8181254a7Smrg import std.range.primitives, std.uni, std.meta,
9181254a7Smrg     std.traits, std.typecons, std.exception;
10*b1e83836Smrg static import std.ascii;
11181254a7Smrg 
12181254a7Smrg // package relevant info from parser into a regex object
makeRegex(S,CG)13181254a7Smrg auto makeRegex(S, CG)(Parser!(S, CG) p)
14181254a7Smrg {
15*b1e83836Smrg     import std.regex.internal.backtracking : BacktrackingMatcher;
16*b1e83836Smrg     import std.regex.internal.thompson : ThompsonMatcher;
17*b1e83836Smrg     import std.algorithm.searching : canFind;
18*b1e83836Smrg     alias Char = BasicElementOf!S;
19*b1e83836Smrg     Regex!Char re;
20181254a7Smrg     auto g = p.g;
21181254a7Smrg     with(re)
22181254a7Smrg     {
23181254a7Smrg         ir = g.ir;
24181254a7Smrg         dict = g.dict;
25181254a7Smrg         ngroup = g.ngroup;
26181254a7Smrg         maxCounterDepth = g.counterDepth;
27181254a7Smrg         flags = p.re_flags;
28181254a7Smrg         charsets = g.charsets;
29181254a7Smrg         matchers = g.matchers;
30181254a7Smrg         backrefed = g.backrefed;
31*b1e83836Smrg         re.pattern = p.origin.idup;
32181254a7Smrg         re.postprocess();
33*b1e83836Smrg         // check if we have backreferences, if so - use backtracking
34*b1e83836Smrg         if (__ctfe) factory = null; // allows us to use the awful enum re = regex(...);
35*b1e83836Smrg         else if (re.backrefed.canFind!"a != 0")
36*b1e83836Smrg             factory =  new RuntimeFactory!(BacktrackingMatcher, Char);
37*b1e83836Smrg         else
38*b1e83836Smrg             factory = new RuntimeFactory!(ThompsonMatcher, Char);
39181254a7Smrg         debug(std_regex_parser)
40181254a7Smrg         {
41181254a7Smrg             __ctfe || print();
42181254a7Smrg         }
43181254a7Smrg         //@@@BUG@@@ (not reduced)
44181254a7Smrg         //somehow just using validate _collides_ with std.utf.validate (!)
45181254a7Smrg         version (assert) re.validateRe();
46181254a7Smrg     }
47181254a7Smrg     return re;
48181254a7Smrg }
49181254a7Smrg 
50181254a7Smrg // helper for unittest
51181254a7Smrg auto makeRegex(S)(S arg)
52181254a7Smrg if (isSomeString!S)
53181254a7Smrg {
54181254a7Smrg     return makeRegex(Parser!(S, CodeGen)(arg, ""));
55181254a7Smrg }
56181254a7Smrg 
57181254a7Smrg @system unittest
58181254a7Smrg {
59181254a7Smrg     import std.algorithm.comparison : equal;
60181254a7Smrg     auto re = makeRegex(`(?P<name>\w+) = (?P<var>\d+)`);
61181254a7Smrg     auto nc = re.namedCaptures;
62181254a7Smrg     static assert(isRandomAccessRange!(typeof(nc)));
63181254a7Smrg     assert(!nc.empty);
64181254a7Smrg     assert(nc.length == 2);
65181254a7Smrg     assert(nc.equal(["name", "var"]));
66181254a7Smrg     assert(nc[0] == "name");
67181254a7Smrg     assert(nc[1..$].equal(["var"]));
68181254a7Smrg 
69181254a7Smrg     re = makeRegex(`(\w+) (?P<named>\w+) (\w+)`);
70181254a7Smrg     nc = re.namedCaptures;
71181254a7Smrg     assert(nc.length == 1);
72181254a7Smrg     assert(nc[0] == "named");
73181254a7Smrg     assert(nc.front == "named");
74181254a7Smrg     assert(nc.back == "named");
75181254a7Smrg 
76181254a7Smrg     re = makeRegex(`(\w+) (\w+)`);
77181254a7Smrg     nc = re.namedCaptures;
78181254a7Smrg     assert(nc.empty);
79181254a7Smrg 
80181254a7Smrg     re = makeRegex(`(?P<year>\d{4})/(?P<month>\d{2})/(?P<day>\d{2})/`);
81181254a7Smrg     nc = re.namedCaptures;
82181254a7Smrg     auto cp = nc.save;
83181254a7Smrg     assert(nc.equal(cp));
84181254a7Smrg     nc.popFront();
85181254a7Smrg     assert(nc.equal(cp[1..$]));
86181254a7Smrg     nc.popBack();
87181254a7Smrg     assert(nc.equal(cp[1 .. $ - 1]));
88181254a7Smrg }
89181254a7Smrg 
90181254a7Smrg 
reverseBytecode()91181254a7Smrg @trusted void reverseBytecode()(Bytecode[] code)
92181254a7Smrg {
93181254a7Smrg     Bytecode[] rev = new Bytecode[code.length];
94181254a7Smrg     uint revPc = cast(uint) rev.length;
95181254a7Smrg     Stack!(Tuple!(uint, uint, uint)) stack;
96181254a7Smrg     uint start = 0;
97181254a7Smrg     uint end = cast(uint) code.length;
98181254a7Smrg     for (;;)
99181254a7Smrg     {
100181254a7Smrg         for (uint pc = start; pc < end; )
101181254a7Smrg         {
102181254a7Smrg             immutable len = code[pc].length;
103181254a7Smrg             if (code[pc].code == IR.GotoEndOr)
104181254a7Smrg                 break; //pick next alternation branch
105181254a7Smrg             if (code[pc].isAtom)
106181254a7Smrg             {
107181254a7Smrg                 rev[revPc - len .. revPc] = code[pc .. pc + len];
108181254a7Smrg                 revPc -= len;
109181254a7Smrg                 pc += len;
110181254a7Smrg             }
111181254a7Smrg             else if (code[pc].isStart || code[pc].isEnd)
112181254a7Smrg             {
113181254a7Smrg                 //skip over other embedded lookbehinds they are reversed
114181254a7Smrg                 if (code[pc].code == IR.LookbehindStart
115181254a7Smrg                     || code[pc].code == IR.NeglookbehindStart)
116181254a7Smrg                 {
117181254a7Smrg                     immutable blockLen = len + code[pc].data
118181254a7Smrg                          + code[pc].pairedLength;
119181254a7Smrg                     rev[revPc - blockLen .. revPc] = code[pc .. pc + blockLen];
120181254a7Smrg                     pc += blockLen;
121181254a7Smrg                     revPc -= blockLen;
122181254a7Smrg                     continue;
123181254a7Smrg                 }
124181254a7Smrg                 immutable second = code[pc].indexOfPair(pc);
125181254a7Smrg                 immutable secLen = code[second].length;
126181254a7Smrg                 rev[revPc - secLen .. revPc] = code[second .. second + secLen];
127181254a7Smrg                 revPc -= secLen;
128181254a7Smrg                 if (code[pc].code == IR.OrStart)
129181254a7Smrg                 {
130181254a7Smrg                     //we pass len bytes forward, but secLen in reverse
131181254a7Smrg                     immutable revStart = revPc - (second + len - secLen - pc);
132181254a7Smrg                     uint r = revStart;
133181254a7Smrg                     uint i = pc + IRL!(IR.OrStart);
134181254a7Smrg                     while (code[i].code == IR.Option)
135181254a7Smrg                     {
136181254a7Smrg                         if (code[i - 1].code != IR.OrStart)
137181254a7Smrg                         {
138181254a7Smrg                             assert(code[i - 1].code == IR.GotoEndOr);
139181254a7Smrg                             rev[r - 1] = code[i - 1];
140181254a7Smrg                         }
141181254a7Smrg                         rev[r] = code[i];
142181254a7Smrg                         auto newStart = i + IRL!(IR.Option);
143181254a7Smrg                         auto newEnd = newStart + code[i].data;
144181254a7Smrg                         auto newRpc = r + code[i].data + IRL!(IR.Option);
145181254a7Smrg                         if (code[newEnd].code != IR.OrEnd)
146181254a7Smrg                         {
147181254a7Smrg                             newRpc--;
148181254a7Smrg                         }
149181254a7Smrg                         stack.push(tuple(newStart, newEnd, newRpc));
150181254a7Smrg                         r += code[i].data + IRL!(IR.Option);
151181254a7Smrg                         i += code[i].data + IRL!(IR.Option);
152181254a7Smrg                     }
153181254a7Smrg                     pc = i;
154181254a7Smrg                     revPc = revStart;
155181254a7Smrg                     assert(code[pc].code == IR.OrEnd);
156181254a7Smrg                 }
157181254a7Smrg                 else
158181254a7Smrg                     pc += len;
159181254a7Smrg             }
160181254a7Smrg         }
161181254a7Smrg         if (stack.empty)
162181254a7Smrg             break;
163181254a7Smrg         start = stack.top[0];
164181254a7Smrg         end = stack.top[1];
165181254a7Smrg         revPc = stack.top[2];
166181254a7Smrg         stack.pop();
167181254a7Smrg     }
168181254a7Smrg     code[] = rev[];
169181254a7Smrg }
170181254a7Smrg 
171181254a7Smrg struct CodeGen
172181254a7Smrg {
173181254a7Smrg     Bytecode[] ir;                 // resulting bytecode
174181254a7Smrg     Stack!(uint) fixupStack;       // stack of opened start instructions
175181254a7Smrg     NamedGroup[] dict;             // maps name -> user group number
176181254a7Smrg     Stack!(uint) groupStack;       // stack of current number of group
177181254a7Smrg     uint nesting = 0;              // group nesting level and repetitions step
178181254a7Smrg     uint lookaroundNest = 0;       // nesting of lookaround
179181254a7Smrg     uint counterDepth = 0;         // current depth of nested counted repetitions
180181254a7Smrg     CodepointSet[] charsets;       // sets for char classes
181181254a7Smrg     const(CharMatcher)[] matchers; // matchers for char classes
182181254a7Smrg     uint[] backrefed;              // bitarray for groups refered by backref
183181254a7Smrg     uint ngroup;                   // final number of groups (of all patterns)
184181254a7Smrg 
startCodeGen185181254a7Smrg     void start(uint length)
186181254a7Smrg     {
187181254a7Smrg         if (!__ctfe)
188181254a7Smrg             ir.reserve((length*5+2)/4);
189181254a7Smrg         fixupStack.push(0);
190181254a7Smrg         groupStack.push(1);//0 - whole match
191181254a7Smrg     }
192181254a7Smrg 
193181254a7Smrg     //mark referenced groups for latter processing
markBackrefCodeGen194181254a7Smrg     void markBackref(uint n)
195181254a7Smrg     {
196181254a7Smrg         if (n/32 >= backrefed.length)
197181254a7Smrg             backrefed.length = n/32 + 1;
198181254a7Smrg         backrefed[n / 32] |= 1 << (n & 31);
199181254a7Smrg     }
200181254a7Smrg 
isOpenGroupCodeGen201181254a7Smrg     bool isOpenGroup(uint n)
202181254a7Smrg     {
203181254a7Smrg         import std.algorithm.searching : canFind;
204181254a7Smrg         // walk the fixup stack and see if there are groups labeled 'n'
205181254a7Smrg         // fixup '0' is reserved for alternations
206181254a7Smrg         return fixupStack.data[1..$].
207181254a7Smrg             canFind!(fix => ir[fix].code == IR.GroupStart && ir[fix].data == n)();
208181254a7Smrg     }
209181254a7Smrg 
putCodeGen210181254a7Smrg     void put(Bytecode code)
211181254a7Smrg     {
212181254a7Smrg         enforce(ir.length < maxCompiledLength,
213181254a7Smrg             "maximum compiled pattern length is exceeded");
214181254a7Smrg         ir ~= code;
215181254a7Smrg     }
216181254a7Smrg 
putRawCodeGen217181254a7Smrg     void putRaw(uint number)
218181254a7Smrg     {
219181254a7Smrg         enforce(ir.length < maxCompiledLength,
220181254a7Smrg             "maximum compiled pattern length is exceeded");
221181254a7Smrg         ir ~= Bytecode.fromRaw(number);
222181254a7Smrg     }
223181254a7Smrg 
224181254a7Smrg     //try to generate optimal IR code for this CodepointSet
charsetToIrCodeGen225181254a7Smrg     @trusted void charsetToIr(CodepointSet set)
226181254a7Smrg     {//@@@BUG@@@ writeln is @system
227181254a7Smrg         uint chars = cast(uint) set.length;
228181254a7Smrg         if (chars < Bytecode.maxSequence)
229181254a7Smrg         {
230181254a7Smrg             switch (chars)
231181254a7Smrg             {
232181254a7Smrg                 case 1:
233181254a7Smrg                     put(Bytecode(IR.Char, set.byCodepoint.front));
234181254a7Smrg                     break;
235181254a7Smrg                 case 0:
236181254a7Smrg                     throw new RegexException("empty CodepointSet not allowed");
237181254a7Smrg                 default:
238181254a7Smrg                     foreach (ch; set.byCodepoint)
239181254a7Smrg                         put(Bytecode(IR.OrChar, ch, chars));
240181254a7Smrg             }
241181254a7Smrg         }
242181254a7Smrg         else
243181254a7Smrg         {
244181254a7Smrg             import std.algorithm.searching : countUntil;
245181254a7Smrg             const ivals = set.byInterval;
246181254a7Smrg             immutable n = charsets.countUntil(set);
247181254a7Smrg             if (n >= 0)
248181254a7Smrg             {
249181254a7Smrg                 if (ivals.length*2 > maxCharsetUsed)
250181254a7Smrg                     put(Bytecode(IR.Trie, cast(uint) n));
251181254a7Smrg                 else
252181254a7Smrg                     put(Bytecode(IR.CodepointSet, cast(uint) n));
253181254a7Smrg                 return;
254181254a7Smrg             }
255181254a7Smrg             if (ivals.length*2 > maxCharsetUsed)
256181254a7Smrg             {
257181254a7Smrg                 auto t  = getMatcher(set);
258181254a7Smrg                 put(Bytecode(IR.Trie, cast(uint) matchers.length));
259181254a7Smrg                 matchers ~= t;
260181254a7Smrg                 debug(std_regex_allocation) writeln("Trie generated");
261181254a7Smrg             }
262181254a7Smrg             else
263181254a7Smrg             {
264181254a7Smrg                 put(Bytecode(IR.CodepointSet, cast(uint) charsets.length));
265181254a7Smrg                 matchers ~= CharMatcher.init;
266181254a7Smrg             }
267181254a7Smrg             charsets ~= set;
268181254a7Smrg             assert(charsets.length == matchers.length);
269181254a7Smrg         }
270181254a7Smrg     }
271181254a7Smrg 
genLogicGroupCodeGen272181254a7Smrg     void genLogicGroup()
273181254a7Smrg     {
274181254a7Smrg         nesting++;
275181254a7Smrg         pushFixup(length);
276181254a7Smrg         put(Bytecode(IR.Nop, 0));
277181254a7Smrg     }
278181254a7Smrg 
genGroupCodeGen279181254a7Smrg     void genGroup()
280181254a7Smrg     {
281181254a7Smrg         nesting++;
282181254a7Smrg         pushFixup(length);
283181254a7Smrg         immutable nglob = groupStack.top++;
284181254a7Smrg         enforce(groupStack.top <= maxGroupNumber, "limit on number of submatches is exceeded");
285181254a7Smrg         put(Bytecode(IR.GroupStart, nglob));
286181254a7Smrg     }
287181254a7Smrg 
genNamedGroupCodeGen288181254a7Smrg     void genNamedGroup(string name)
289181254a7Smrg     {
290181254a7Smrg         import std.array : insertInPlace;
291181254a7Smrg         import std.range : assumeSorted;
292181254a7Smrg         nesting++;
293181254a7Smrg         pushFixup(length);
294181254a7Smrg         immutable nglob = groupStack.top++;
295181254a7Smrg         enforce(groupStack.top <= maxGroupNumber, "limit on submatches is exceeded");
296181254a7Smrg         auto t = NamedGroup(name, nglob);
297181254a7Smrg         auto d = assumeSorted!"a.name < b.name"(dict);
298181254a7Smrg         immutable ind = d.lowerBound(t).length;
299181254a7Smrg         insertInPlace(dict, ind, t);
300181254a7Smrg         put(Bytecode(IR.GroupStart, nglob));
301181254a7Smrg     }
302181254a7Smrg 
303181254a7Smrg         //generate code for start of lookaround: (?= (?! (?<= (?<!
genLookaroundCodeGen304181254a7Smrg     void genLookaround(IR opcode)
305181254a7Smrg     {
306181254a7Smrg         nesting++;
307181254a7Smrg         pushFixup(length);
308181254a7Smrg         put(Bytecode(opcode, 0));
309181254a7Smrg         put(Bytecode.fromRaw(0));
310181254a7Smrg         put(Bytecode.fromRaw(0));
311181254a7Smrg         groupStack.push(0);
312181254a7Smrg         lookaroundNest++;
313181254a7Smrg         enforce(lookaroundNest <= maxLookaroundDepth,
314181254a7Smrg             "maximum lookaround depth is exceeded");
315181254a7Smrg     }
316181254a7Smrg 
endPatternCodeGen317181254a7Smrg     void endPattern(uint num)
318181254a7Smrg     {
319181254a7Smrg         import std.algorithm.comparison : max;
320181254a7Smrg         put(Bytecode(IR.End, num));
321181254a7Smrg         ngroup = max(ngroup, groupStack.top);
322181254a7Smrg         groupStack.top = 1; // reset group counter
323181254a7Smrg     }
324181254a7Smrg 
325181254a7Smrg     //fixup lookaround with start at offset fix and append a proper *-End opcode
fixLookaroundCodeGen326181254a7Smrg     void fixLookaround(uint fix)
327181254a7Smrg     {
328181254a7Smrg         lookaroundNest--;
329181254a7Smrg         ir[fix] = Bytecode(ir[fix].code,
330181254a7Smrg             cast(uint) ir.length - fix - IRL!(IR.LookaheadStart));
331181254a7Smrg         auto g = groupStack.pop();
332181254a7Smrg         assert(!groupStack.empty);
333181254a7Smrg         ir[fix+1] = Bytecode.fromRaw(groupStack.top);
334181254a7Smrg         //groups are cumulative across lookarounds
335181254a7Smrg         ir[fix+2] = Bytecode.fromRaw(groupStack.top+g);
336181254a7Smrg         groupStack.top += g;
337181254a7Smrg         if (ir[fix].code == IR.LookbehindStart || ir[fix].code == IR.NeglookbehindStart)
338181254a7Smrg         {
339181254a7Smrg             reverseBytecode(ir[fix + IRL!(IR.LookbehindStart) .. $]);
340181254a7Smrg         }
341181254a7Smrg         put(ir[fix].paired);
342181254a7Smrg     }
343181254a7Smrg 
344181254a7Smrg     // repetition of {1,1}
fixRepetitionCodeGen345181254a7Smrg     void fixRepetition(uint offset)
346181254a7Smrg     {
347181254a7Smrg         import std.algorithm.mutation : copy;
348181254a7Smrg         immutable replace = ir[offset].code == IR.Nop;
349181254a7Smrg         if (replace)
350181254a7Smrg         {
351181254a7Smrg             copy(ir[offset + 1 .. $], ir[offset .. $ - 1]);
352181254a7Smrg             ir.length -= 1;
353181254a7Smrg         }
354181254a7Smrg     }
355181254a7Smrg 
356181254a7Smrg     // repetition of {x,y}
fixRepetitionCodeGen357181254a7Smrg     void fixRepetition(uint offset, uint min, uint max, bool greedy)
358181254a7Smrg     {
359181254a7Smrg         static import std.algorithm.comparison;
360181254a7Smrg         import std.algorithm.mutation : copy;
361181254a7Smrg         import std.array : insertInPlace;
362181254a7Smrg         immutable replace = ir[offset].code == IR.Nop;
363181254a7Smrg         immutable len = cast(uint) ir.length - offset - replace;
364181254a7Smrg         if (max != infinite)
365181254a7Smrg         {
366181254a7Smrg             if (min != 1 || max != 1)
367181254a7Smrg             {
368181254a7Smrg                 Bytecode op = Bytecode(greedy ? IR.RepeatStart : IR.RepeatQStart, len);
369181254a7Smrg                 if (replace)
370181254a7Smrg                     ir[offset] = op;
371181254a7Smrg                 else
372181254a7Smrg                     insertInPlace(ir, offset, op);
373181254a7Smrg                 put(Bytecode(greedy ? IR.RepeatEnd : IR.RepeatQEnd, len));
374181254a7Smrg                 put(Bytecode.init); //hotspot
375181254a7Smrg                 putRaw(1);
376181254a7Smrg                 putRaw(min);
377181254a7Smrg                 putRaw(max);
378181254a7Smrg                 counterDepth = std.algorithm.comparison.max(counterDepth, nesting+1);
379181254a7Smrg             }
380181254a7Smrg         }
381181254a7Smrg         else if (min) //&& max is infinite
382181254a7Smrg         {
383181254a7Smrg             if (min != 1)
384181254a7Smrg             {
385181254a7Smrg                 Bytecode op = Bytecode(greedy ? IR.RepeatStart : IR.RepeatQStart, len);
386181254a7Smrg                 if (replace)
387181254a7Smrg                     ir[offset] = op;
388181254a7Smrg                 else
389181254a7Smrg                     insertInPlace(ir, offset, op);
390181254a7Smrg                 offset += 1;//so it still points to the repeated block
391181254a7Smrg                 put(Bytecode(greedy ? IR.RepeatEnd : IR.RepeatQEnd, len));
392181254a7Smrg                 put(Bytecode.init); //hotspot
393181254a7Smrg                 putRaw(1);
394181254a7Smrg                 putRaw(min);
395181254a7Smrg                 putRaw(min);
396181254a7Smrg                 counterDepth = std.algorithm.comparison.max(counterDepth, nesting+1);
397181254a7Smrg             }
398181254a7Smrg             else if (replace)
399181254a7Smrg             {
400181254a7Smrg                 copy(ir[offset+1 .. $], ir[offset .. $-1]);
401181254a7Smrg                 ir.length -= 1;
402181254a7Smrg             }
403181254a7Smrg             put(Bytecode(greedy ? IR.InfiniteStart : IR.InfiniteQStart, len));
404181254a7Smrg             enforce(ir.length + len < maxCompiledLength,  "maximum compiled pattern length is exceeded");
405181254a7Smrg             ir ~= ir[offset .. offset+len];
406181254a7Smrg             //IR.InfinteX is always a hotspot
407181254a7Smrg             put(Bytecode(greedy ? IR.InfiniteEnd : IR.InfiniteQEnd, len));
408181254a7Smrg             put(Bytecode.init); //merge index
409181254a7Smrg         }
410181254a7Smrg         else//vanila {0,inf}
411181254a7Smrg         {
412181254a7Smrg             Bytecode op = Bytecode(greedy ? IR.InfiniteStart : IR.InfiniteQStart, len);
413181254a7Smrg             if (replace)
414181254a7Smrg                 ir[offset] = op;
415181254a7Smrg             else
416181254a7Smrg                 insertInPlace(ir, offset, op);
417181254a7Smrg             //IR.InfinteX is always a hotspot
418181254a7Smrg             put(Bytecode(greedy ? IR.InfiniteEnd : IR.InfiniteQEnd, len));
419181254a7Smrg             put(Bytecode.init); //merge index
420181254a7Smrg         }
421181254a7Smrg     }
422181254a7Smrg 
fixAlternationCodeGen423181254a7Smrg     void fixAlternation()
424181254a7Smrg     {
425181254a7Smrg         import std.array : insertInPlace;
426181254a7Smrg         uint fix = fixupStack.top;
427181254a7Smrg         if (ir.length > fix && ir[fix].code == IR.Option)
428181254a7Smrg         {
429181254a7Smrg             ir[fix] = Bytecode(ir[fix].code, cast(uint) ir.length - fix);
430181254a7Smrg             put(Bytecode(IR.GotoEndOr, 0));
431181254a7Smrg             fixupStack.top = cast(uint) ir.length; //replace latest fixup for Option
432181254a7Smrg             put(Bytecode(IR.Option, 0));
433181254a7Smrg             return;
434181254a7Smrg         }
435181254a7Smrg         uint len, orStart;
436181254a7Smrg         //start a new option
437181254a7Smrg         if (fixupStack.length == 1)
438181254a7Smrg         {//only root entry, effectively no fixup
439181254a7Smrg             len = cast(uint) ir.length + IRL!(IR.GotoEndOr);
440181254a7Smrg             orStart = 0;
441181254a7Smrg         }
442181254a7Smrg         else
443181254a7Smrg         {//IR.lookahead, etc. fixups that have length > 1, thus check ir[x].length
444181254a7Smrg             len = cast(uint) ir.length - fix - (ir[fix].length - 1);
445181254a7Smrg             orStart = fix + ir[fix].length;
446181254a7Smrg         }
447181254a7Smrg         insertInPlace(ir, orStart, Bytecode(IR.OrStart, 0), Bytecode(IR.Option, len));
448181254a7Smrg         assert(ir[orStart].code == IR.OrStart);
449181254a7Smrg         put(Bytecode(IR.GotoEndOr, 0));
450181254a7Smrg         fixupStack.push(orStart); //fixup for StartOR
451181254a7Smrg         fixupStack.push(cast(uint) ir.length); //for second Option
452181254a7Smrg         put(Bytecode(IR.Option, 0));
453181254a7Smrg     }
454181254a7Smrg 
455181254a7Smrg     // finalizes IR.Option, fix points to the first option of sequence
finishAlternationCodeGen456181254a7Smrg     void finishAlternation(uint fix)
457181254a7Smrg     {
458181254a7Smrg         enforce(ir[fix].code == IR.Option, "no matching ')'");
459181254a7Smrg         ir[fix] = Bytecode(ir[fix].code, cast(uint) ir.length - fix - IRL!(IR.OrStart));
460181254a7Smrg         fix = fixupStack.pop();
461181254a7Smrg         enforce(ir[fix].code == IR.OrStart, "no matching ')'");
462181254a7Smrg         ir[fix] = Bytecode(IR.OrStart, cast(uint) ir.length - fix - IRL!(IR.OrStart));
463181254a7Smrg         put(Bytecode(IR.OrEnd, cast(uint) ir.length - fix - IRL!(IR.OrStart)));
464181254a7Smrg         uint pc = fix + IRL!(IR.OrStart);
465181254a7Smrg         while (ir[pc].code == IR.Option)
466181254a7Smrg         {
467181254a7Smrg             pc = pc + ir[pc].data;
468181254a7Smrg             if (ir[pc].code != IR.GotoEndOr)
469181254a7Smrg                 break;
470181254a7Smrg             ir[pc] = Bytecode(IR.GotoEndOr, cast(uint)(ir.length - pc - IRL!(IR.OrEnd)));
471181254a7Smrg             pc += IRL!(IR.GotoEndOr);
472181254a7Smrg         }
473181254a7Smrg         put(Bytecode.fromRaw(0));
474181254a7Smrg     }
475181254a7Smrg 
476181254a7Smrg     // returns: (flag - repetition possible?, fixup of the start of this "group")
477181254a7Smrg     Tuple!(bool, uint) onClose()
478181254a7Smrg     {
479181254a7Smrg         nesting--;
480181254a7Smrg         uint fix = popFixup();
481181254a7Smrg         switch (ir[fix].code)
482181254a7Smrg         {
483181254a7Smrg         case IR.GroupStart:
484181254a7Smrg             put(Bytecode(IR.GroupEnd, ir[fix].data));
485181254a7Smrg             return tuple(true, fix);
486181254a7Smrg         case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
487181254a7Smrg             assert(lookaroundNest);
488181254a7Smrg             fixLookaround(fix);
489181254a7Smrg             return tuple(false, 0u);
490181254a7Smrg         case IR.Option: //| xxx )
491181254a7Smrg             //two fixups: last option + full OR
492181254a7Smrg             finishAlternation(fix);
493181254a7Smrg             fix = topFixup;
494181254a7Smrg             switch (ir[fix].code)
495181254a7Smrg             {
496181254a7Smrg             case IR.GroupStart:
497181254a7Smrg                 popFixup();
498181254a7Smrg                 put(Bytecode(IR.GroupEnd, ir[fix].data));
499181254a7Smrg                 return tuple(true, fix);
500181254a7Smrg             case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
501181254a7Smrg                 assert(lookaroundNest);
502181254a7Smrg                 fix = popFixup();
503181254a7Smrg                 fixLookaround(fix);
504181254a7Smrg                 return tuple(false, 0u);
505181254a7Smrg             default://(?:xxx)
506181254a7Smrg                 popFixup();
507181254a7Smrg                 return tuple(true, fix);
508181254a7Smrg             }
509181254a7Smrg         default://(?:xxx)
510181254a7Smrg             return tuple(true, fix);
511181254a7Smrg         }
512181254a7Smrg     }
513181254a7Smrg 
popFixupCodeGen514181254a7Smrg     uint popFixup(){ return fixupStack.pop(); }
515181254a7Smrg 
pushFixupCodeGen516181254a7Smrg     void pushFixup(uint val){ return fixupStack.push(val); }
517181254a7Smrg 
topFixupCodeGen518181254a7Smrg     @property uint topFixup(){ return fixupStack.top; }
519181254a7Smrg 
fixupLengthCodeGen520181254a7Smrg     @property size_t fixupLength(){ return fixupStack.data.length; }
521181254a7Smrg 
lengthCodeGen522181254a7Smrg     @property uint length(){ return cast(uint) ir.length; }
523181254a7Smrg }
524181254a7Smrg 
525181254a7Smrg // safety limits
526181254a7Smrg enum maxGroupNumber = 2^^19;
527181254a7Smrg enum maxLookaroundDepth = 16;
528181254a7Smrg // *Bytecode.sizeof, i.e. 1Mb of bytecode alone
529181254a7Smrg enum maxCompiledLength = 2^^18;
530181254a7Smrg // amounts to up to 4 Mb of auxilary table for matching
531181254a7Smrg enum maxCumulativeRepetitionLength = 2^^20;
532181254a7Smrg // marker to indicate infinite repetition
533181254a7Smrg enum infinite = ~0u;
534181254a7Smrg 
535181254a7Smrg struct Parser(R, Generator)
536181254a7Smrg if (isForwardRange!R && is(ElementType!R : dchar))
537181254a7Smrg {
538*b1e83836Smrg     dchar front;
539181254a7Smrg     bool empty;
540181254a7Smrg     R pat, origin;       //keep full pattern for pretty printing error messages
541181254a7Smrg     uint re_flags = 0;   //global flags e.g. multiline + internal ones
542181254a7Smrg     Generator g;
543181254a7Smrg 
544181254a7Smrg     @trusted this(S)(R pattern, S flags)
545181254a7Smrg         if (isSomeString!S)
546181254a7Smrg     {
547181254a7Smrg         pat = origin = pattern;
548181254a7Smrg         //reserve slightly more then avg as sampled from unittests
549181254a7Smrg         parseFlags(flags);
550*b1e83836Smrg         front = ' ';//a safe default for freeform parsing
551*b1e83836Smrg         popFront();
552181254a7Smrg         g.start(cast(uint) pat.length);
553181254a7Smrg         try
554181254a7Smrg         {
555181254a7Smrg             parseRegex();
556181254a7Smrg         }
catch(Exception e)557181254a7Smrg         catch (Exception e)
558181254a7Smrg         {
559181254a7Smrg             error(e.msg);//also adds pattern location
560181254a7Smrg         }
561181254a7Smrg         g.endPattern(1);
562181254a7Smrg     }
563181254a7Smrg 
_popFront()564*b1e83836Smrg     void _popFront()
565181254a7Smrg     {
566181254a7Smrg         if (pat.empty)
567181254a7Smrg         {
568181254a7Smrg             empty =  true;
569181254a7Smrg         }
570*b1e83836Smrg         else
571*b1e83836Smrg         {
572*b1e83836Smrg             front = pat.front;
573181254a7Smrg             pat.popFront();
574*b1e83836Smrg         }
575181254a7Smrg     }
576181254a7Smrg 
skipSpace()577181254a7Smrg     void skipSpace()
578181254a7Smrg     {
579*b1e83836Smrg         while (!empty && isWhite(front)) _popFront();
580181254a7Smrg     }
581181254a7Smrg 
popFront()582*b1e83836Smrg     void popFront()
583181254a7Smrg     {
584*b1e83836Smrg         _popFront();
585*b1e83836Smrg         if (re_flags & RegexOption.freeform) skipSpace();
586181254a7Smrg     }
587*b1e83836Smrg 
save()588*b1e83836Smrg     auto save(){ return this; }
589181254a7Smrg 
590181254a7Smrg     //parsing number with basic overflow check
parseDecimal()591181254a7Smrg     uint parseDecimal()
592181254a7Smrg     {
593181254a7Smrg         uint r = 0;
594*b1e83836Smrg         while (std.ascii.isDigit(front))
595181254a7Smrg         {
596181254a7Smrg             if (r >= (uint.max/10))
597181254a7Smrg                 error("Overflow in decimal number");
598*b1e83836Smrg             r = 10*r + cast(uint)(front-'0');
599*b1e83836Smrg             popFront();
600*b1e83836Smrg             if (empty) break;
601181254a7Smrg         }
602181254a7Smrg         return r;
603181254a7Smrg     }
604181254a7Smrg 
605181254a7Smrg     //
parseFlags(S)606181254a7Smrg     @trusted void parseFlags(S)(S flags)
607181254a7Smrg     {//@@@BUG@@@ text is @system
608181254a7Smrg         import std.conv : text;
609181254a7Smrg         foreach (ch; flags)//flags are ASCII anyway
610181254a7Smrg         {
611181254a7Smrg         L_FlagSwitch:
612181254a7Smrg             switch (ch)
613181254a7Smrg             {
614181254a7Smrg 
615181254a7Smrg                 foreach (i, op; __traits(allMembers, RegexOption))
616181254a7Smrg                 {
617181254a7Smrg                     case RegexOptionNames[i]:
618181254a7Smrg                             if (re_flags & mixin("RegexOption."~op))
619181254a7Smrg                                 throw new RegexException(text("redundant flag specified: ",ch));
620181254a7Smrg                             re_flags |= mixin("RegexOption."~op);
621181254a7Smrg                             break L_FlagSwitch;
622181254a7Smrg                 }
623181254a7Smrg                 default:
624181254a7Smrg                     throw new RegexException(text("unknown regex flag '",ch,"'"));
625181254a7Smrg             }
626181254a7Smrg         }
627181254a7Smrg     }
628181254a7Smrg 
629181254a7Smrg     //parse and store IR for regex pattern
parseRegex()630181254a7Smrg     @trusted void parseRegex()
631181254a7Smrg     {
632181254a7Smrg         uint fix;//fixup pointer
633181254a7Smrg 
634181254a7Smrg         while (!empty)
635181254a7Smrg         {
636181254a7Smrg             debug(std_regex_parser)
637181254a7Smrg                 __ctfe || writeln("*LR*\nSource: ", pat, "\nStack: ",fixupStack.data);
638*b1e83836Smrg             switch (front)
639181254a7Smrg             {
640181254a7Smrg             case '(':
641*b1e83836Smrg                 popFront();
642*b1e83836Smrg                 if (front == '?')
643181254a7Smrg                 {
644*b1e83836Smrg                     popFront();
645*b1e83836Smrg                     switch (front)
646181254a7Smrg                     {
647181254a7Smrg                     case '#':
648181254a7Smrg                         for (;;)
649181254a7Smrg                         {
650*b1e83836Smrg                             popFront();
651*b1e83836Smrg                             enforce(!empty, "Unexpected end of pattern");
652*b1e83836Smrg                             if (front == ')')
653181254a7Smrg                             {
654*b1e83836Smrg                                 popFront();
655181254a7Smrg                                 break;
656181254a7Smrg                             }
657181254a7Smrg                         }
658181254a7Smrg                         break;
659181254a7Smrg                     case ':':
660181254a7Smrg                         g.genLogicGroup();
661*b1e83836Smrg                         popFront();
662181254a7Smrg                         break;
663181254a7Smrg                     case '=':
664181254a7Smrg                         g.genLookaround(IR.LookaheadStart);
665*b1e83836Smrg                         popFront();
666181254a7Smrg                         break;
667181254a7Smrg                     case '!':
668181254a7Smrg                         g.genLookaround(IR.NeglookaheadStart);
669*b1e83836Smrg                         popFront();
670181254a7Smrg                         break;
671181254a7Smrg                     case 'P':
672*b1e83836Smrg                         popFront();
673*b1e83836Smrg                         enforce(front == '<', "Expected '<' in named group");
674181254a7Smrg                         string name;
675*b1e83836Smrg                         popFront();
676*b1e83836Smrg                         if (empty || !(isAlpha(front) || front == '_'))
677181254a7Smrg                             error("Expected alpha starting a named group");
678*b1e83836Smrg                         name ~= front;
679*b1e83836Smrg                         popFront();
680*b1e83836Smrg                         while (!empty && (isAlpha(front) ||
681*b1e83836Smrg                             front == '_' || std.ascii.isDigit(front)))
682181254a7Smrg                         {
683*b1e83836Smrg                             name ~= front;
684*b1e83836Smrg                             popFront();
685181254a7Smrg                         }
686*b1e83836Smrg                         enforce(front == '>', "Expected '>' closing named group");
687*b1e83836Smrg                         popFront();
688181254a7Smrg                         g.genNamedGroup(name);
689181254a7Smrg                         break;
690181254a7Smrg                     case '<':
691*b1e83836Smrg                         popFront();
692*b1e83836Smrg                         if (front == '=')
693181254a7Smrg                             g.genLookaround(IR.LookbehindStart);
694*b1e83836Smrg                         else if (front == '!')
695181254a7Smrg                             g.genLookaround(IR.NeglookbehindStart);
696181254a7Smrg                         else
697181254a7Smrg                             error("'!' or '=' expected after '<'");
698*b1e83836Smrg                         popFront();
699181254a7Smrg                         break;
700181254a7Smrg                     default:
701181254a7Smrg                         uint enableFlags, disableFlags;
702181254a7Smrg                         bool enable = true;
703181254a7Smrg                         do
704181254a7Smrg                         {
705*b1e83836Smrg                             switch (front)
706181254a7Smrg                             {
707181254a7Smrg                             case 's':
708181254a7Smrg                                 if (enable)
709181254a7Smrg                                     enableFlags |= RegexOption.singleline;
710181254a7Smrg                                 else
711181254a7Smrg                                     disableFlags |= RegexOption.singleline;
712181254a7Smrg                                 break;
713181254a7Smrg                             case 'x':
714181254a7Smrg                                 if (enable)
715181254a7Smrg                                     enableFlags |= RegexOption.freeform;
716181254a7Smrg                                 else
717181254a7Smrg                                     disableFlags |= RegexOption.freeform;
718181254a7Smrg                                 break;
719181254a7Smrg                             case 'i':
720181254a7Smrg                                 if (enable)
721181254a7Smrg                                     enableFlags |= RegexOption.casefold;
722181254a7Smrg                                 else
723181254a7Smrg                                     disableFlags |= RegexOption.casefold;
724181254a7Smrg                                 break;
725181254a7Smrg                             case 'm':
726181254a7Smrg                                 if (enable)
727181254a7Smrg                                     enableFlags |= RegexOption.multiline;
728181254a7Smrg                                 else
729181254a7Smrg                                     disableFlags |= RegexOption.multiline;
730181254a7Smrg                                 break;
731181254a7Smrg                             case '-':
732181254a7Smrg                                 if (!enable)
733181254a7Smrg                                     error(" unexpected second '-' in flags");
734181254a7Smrg                                 enable = false;
735181254a7Smrg                                 break;
736181254a7Smrg                             default:
737181254a7Smrg                                 error(" 's', 'x', 'i', 'm' or '-' expected after '(?' ");
738181254a7Smrg                             }
739*b1e83836Smrg                             popFront();
740*b1e83836Smrg                         }while (front != ')');
741*b1e83836Smrg                         popFront();
742181254a7Smrg                         re_flags |= enableFlags;
743181254a7Smrg                         re_flags &= ~disableFlags;
744181254a7Smrg                     }
745181254a7Smrg                 }
746181254a7Smrg                 else
747181254a7Smrg                 {
748181254a7Smrg                     g.genGroup();
749181254a7Smrg                 }
750181254a7Smrg                 break;
751181254a7Smrg             case ')':
752181254a7Smrg                 enforce(g.nesting, "Unmatched ')'");
753*b1e83836Smrg                 popFront();
754181254a7Smrg                 auto pair = g.onClose();
755181254a7Smrg                 if (pair[0])
756181254a7Smrg                     parseQuantifier(pair[1]);
757181254a7Smrg                 break;
758181254a7Smrg             case '|':
759*b1e83836Smrg                 popFront();
760181254a7Smrg                 g.fixAlternation();
761181254a7Smrg                 break;
762181254a7Smrg             default://no groups or whatever
763181254a7Smrg                 immutable start = g.length;
764181254a7Smrg                 parseAtom();
765181254a7Smrg                 parseQuantifier(start);
766181254a7Smrg             }
767181254a7Smrg         }
768181254a7Smrg 
769181254a7Smrg         if (g.fixupLength != 1)
770181254a7Smrg         {
771181254a7Smrg             fix = g.popFixup();
772181254a7Smrg             g.finishAlternation(fix);
773181254a7Smrg             enforce(g.fixupLength == 1, "no matching ')'");
774181254a7Smrg         }
775181254a7Smrg     }
776181254a7Smrg 
777181254a7Smrg 
778181254a7Smrg     //parse and store IR for atom-quantifier pair
parseQuantifier(uint offset)779181254a7Smrg     @trusted void parseQuantifier(uint offset)
780181254a7Smrg     {//copy is @system
781181254a7Smrg         if (empty)
782181254a7Smrg             return g.fixRepetition(offset);
783181254a7Smrg         uint min, max;
784*b1e83836Smrg         switch (front)
785181254a7Smrg         {
786181254a7Smrg         case '*':
787181254a7Smrg             min = 0;
788181254a7Smrg             max = infinite;
789181254a7Smrg             break;
790181254a7Smrg         case '?':
791181254a7Smrg             min = 0;
792181254a7Smrg             max = 1;
793181254a7Smrg             break;
794181254a7Smrg         case '+':
795181254a7Smrg             min = 1;
796181254a7Smrg             max = infinite;
797181254a7Smrg             break;
798181254a7Smrg         case '{':
799*b1e83836Smrg             popFront();
800*b1e83836Smrg             enforce(!empty, "Unexpected end of regex pattern");
801*b1e83836Smrg             enforce(std.ascii.isDigit(front), "First number required in repetition");
802181254a7Smrg             min = parseDecimal();
803*b1e83836Smrg             if (front == '}')
804181254a7Smrg                 max = min;
805*b1e83836Smrg             else if (front == ',')
806181254a7Smrg             {
807*b1e83836Smrg                 popFront();
808*b1e83836Smrg                 if (std.ascii.isDigit(front))
809181254a7Smrg                     max = parseDecimal();
810*b1e83836Smrg                 else if (front == '}')
811181254a7Smrg                     max = infinite;
812181254a7Smrg                 else
813181254a7Smrg                     error("Unexpected symbol in regex pattern");
814181254a7Smrg                 skipSpace();
815*b1e83836Smrg                 enforce(front == '}', "Unmatched '{' in regex pattern");
816181254a7Smrg             }
817181254a7Smrg             else
818181254a7Smrg                 error("Unexpected symbol in regex pattern");
819*b1e83836Smrg             enforce(min <= max, "Illegal {n,m} quantifier");
820181254a7Smrg             break;
821181254a7Smrg         default:
822181254a7Smrg             g.fixRepetition(offset);
823181254a7Smrg             return;
824181254a7Smrg         }
825181254a7Smrg         bool greedy = true;
826181254a7Smrg         //check only if we managed to get new symbol
827*b1e83836Smrg         popFront();
828*b1e83836Smrg         if (!empty && front == '?')
829181254a7Smrg         {
830181254a7Smrg             greedy = false;
831*b1e83836Smrg             popFront();
832181254a7Smrg         }
833181254a7Smrg         g.fixRepetition(offset, min, max, greedy);
834181254a7Smrg     }
835181254a7Smrg 
836181254a7Smrg     //parse and store IR for atom
parseAtom()837181254a7Smrg     void parseAtom()
838181254a7Smrg     {
839181254a7Smrg         if (empty)
840181254a7Smrg             return;
841*b1e83836Smrg         switch (front)
842181254a7Smrg         {
843181254a7Smrg         case '*', '?', '+', '|', '{', '}':
844*b1e83836Smrg             return error("'*', '+', '?', '{', '}' not allowed in atom");
845181254a7Smrg         case '.':
846181254a7Smrg             if (re_flags & RegexOption.singleline)
847181254a7Smrg                 g.put(Bytecode(IR.Any, 0));
848181254a7Smrg             else
849181254a7Smrg             {
850181254a7Smrg                 CodepointSet set;
851181254a7Smrg                 g.charsetToIr(set.add('\n','\n'+1).add('\r', '\r'+1).inverted);
852181254a7Smrg             }
853*b1e83836Smrg             popFront();
854181254a7Smrg             break;
855181254a7Smrg         case '[':
856181254a7Smrg             parseCharset();
857181254a7Smrg             break;
858181254a7Smrg         case '\\':
859*b1e83836Smrg             _popFront();
860*b1e83836Smrg             enforce(!empty, "Unfinished escape sequence");
861181254a7Smrg             parseEscape();
862181254a7Smrg             break;
863181254a7Smrg         case '^':
864181254a7Smrg             if (re_flags & RegexOption.multiline)
865181254a7Smrg                 g.put(Bytecode(IR.Bol, 0));
866181254a7Smrg             else
867181254a7Smrg                 g.put(Bytecode(IR.Bof, 0));
868*b1e83836Smrg             popFront();
869181254a7Smrg             break;
870181254a7Smrg         case '$':
871181254a7Smrg             if (re_flags & RegexOption.multiline)
872181254a7Smrg                 g.put(Bytecode(IR.Eol, 0));
873181254a7Smrg             else
874181254a7Smrg                 g.put(Bytecode(IR.Eof, 0));
875*b1e83836Smrg             popFront();
876181254a7Smrg             break;
877181254a7Smrg         default:
878181254a7Smrg             if (re_flags & RegexOption.casefold)
879181254a7Smrg             {
880*b1e83836Smrg                 auto range = simpleCaseFoldings(front);
881181254a7Smrg                 assert(range.length <= 5);
882181254a7Smrg                 if (range.length == 1)
883181254a7Smrg                     g.put(Bytecode(IR.Char, range.front));
884181254a7Smrg                 else
885181254a7Smrg                     foreach (v; range)
886181254a7Smrg                         g.put(Bytecode(IR.OrChar, v, cast(uint) range.length));
887181254a7Smrg             }
888181254a7Smrg             else
889*b1e83836Smrg                 g.put(Bytecode(IR.Char, front));
890*b1e83836Smrg             popFront();
891181254a7Smrg         }
892181254a7Smrg     }
893181254a7Smrg 
894181254a7Smrg     //parse and store IR for CodepointSet
parseCharset()895181254a7Smrg     void parseCharset()
896181254a7Smrg     {
897181254a7Smrg         const save = re_flags;
898181254a7Smrg         re_flags &= ~RegexOption.freeform; // stop ignoring whitespace if we did
899*b1e83836Smrg         bool casefold = cast(bool)(re_flags & RegexOption.casefold);
900*b1e83836Smrg         g.charsetToIr(unicode.parseSet(this, casefold));
901181254a7Smrg         re_flags = save;
902*b1e83836Smrg         // Last next() in parseCharset is executed w/o freeform flag
903181254a7Smrg         if (re_flags & RegexOption.freeform) skipSpace();
904181254a7Smrg     }
905181254a7Smrg 
906181254a7Smrg     //parse and generate IR for escape stand alone escape sequence
parseEscape()907181254a7Smrg     @trusted void parseEscape()
908181254a7Smrg     {//accesses array of appender
909181254a7Smrg         import std.algorithm.iteration : sum;
910*b1e83836Smrg         switch (front)
911181254a7Smrg         {
912*b1e83836Smrg         case 'f':   popFront(); g.put(Bytecode(IR.Char, '\f')); break;
913*b1e83836Smrg         case 'n':   popFront(); g.put(Bytecode(IR.Char, '\n')); break;
914*b1e83836Smrg         case 'r':   popFront(); g.put(Bytecode(IR.Char, '\r')); break;
915*b1e83836Smrg         case 't':   popFront(); g.put(Bytecode(IR.Char, '\t')); break;
916*b1e83836Smrg         case 'v':   popFront(); g.put(Bytecode(IR.Char, '\v')); break;
917181254a7Smrg 
918181254a7Smrg         case 'd':
919*b1e83836Smrg             popFront();
920181254a7Smrg             g.charsetToIr(unicode.Nd);
921181254a7Smrg             break;
922181254a7Smrg         case 'D':
923*b1e83836Smrg             popFront();
924181254a7Smrg             g.charsetToIr(unicode.Nd.inverted);
925181254a7Smrg             break;
926*b1e83836Smrg         case 'b':   popFront(); g.put(Bytecode(IR.Wordboundary, 0)); break;
927*b1e83836Smrg         case 'B':   popFront(); g.put(Bytecode(IR.Notwordboundary, 0)); break;
928181254a7Smrg         case 's':
929*b1e83836Smrg             popFront();
930181254a7Smrg             g.charsetToIr(unicode.White_Space);
931181254a7Smrg             break;
932181254a7Smrg         case 'S':
933*b1e83836Smrg             popFront();
934181254a7Smrg             g.charsetToIr(unicode.White_Space.inverted);
935181254a7Smrg             break;
936181254a7Smrg         case 'w':
937*b1e83836Smrg             popFront();
938181254a7Smrg             g.charsetToIr(wordCharacter);
939181254a7Smrg             break;
940181254a7Smrg         case 'W':
941*b1e83836Smrg             popFront();
942181254a7Smrg             g.charsetToIr(wordCharacter.inverted);
943181254a7Smrg             break;
944181254a7Smrg         case 'p': case 'P':
945*b1e83836Smrg             bool casefold = cast(bool)(re_flags & RegexOption.casefold);
946*b1e83836Smrg             auto set = unicode.parsePropertySpec(this, front == 'P', casefold);
947*b1e83836Smrg             g.charsetToIr(set);
948181254a7Smrg             break;
949181254a7Smrg         case 'x':
950181254a7Smrg             immutable code = parseUniHex(pat, 2);
951*b1e83836Smrg             popFront();
952181254a7Smrg             g.put(Bytecode(IR.Char,code));
953181254a7Smrg             break;
954181254a7Smrg         case 'u': case 'U':
955*b1e83836Smrg             immutable code = parseUniHex(pat, front == 'u' ? 4 : 8);
956*b1e83836Smrg             popFront();
957181254a7Smrg             g.put(Bytecode(IR.Char, code));
958181254a7Smrg             break;
959181254a7Smrg         case 'c': //control codes
960*b1e83836Smrg             Bytecode code = Bytecode(IR.Char, unicode.parseControlCode(this));
961*b1e83836Smrg             popFront();
962181254a7Smrg             g.put(code);
963181254a7Smrg             break;
964181254a7Smrg         case '0':
965*b1e83836Smrg             popFront();
966181254a7Smrg             g.put(Bytecode(IR.Char, 0));//NUL character
967181254a7Smrg             break;
968181254a7Smrg         case '1': .. case '9':
969*b1e83836Smrg             uint nref = cast(uint) front - '0';
970181254a7Smrg             immutable maxBackref = sum(g.groupStack.data);
971181254a7Smrg             enforce(nref < maxBackref, "Backref to unseen group");
972181254a7Smrg             //perl's disambiguation rule i.e.
973181254a7Smrg             //get next digit only if there is such group number
974*b1e83836Smrg             popFront();
975*b1e83836Smrg             while (nref < maxBackref && !empty && std.ascii.isDigit(front))
976181254a7Smrg             {
977*b1e83836Smrg                 nref = nref * 10 + front - '0';
978*b1e83836Smrg                 popFront();
979181254a7Smrg             }
980181254a7Smrg             if (nref >= maxBackref)
981181254a7Smrg                 nref /= 10;
982181254a7Smrg             enforce(!g.isOpenGroup(nref), "Backref to open group");
983181254a7Smrg             uint localLimit = maxBackref - g.groupStack.top;
984181254a7Smrg             if (nref >= localLimit)
985181254a7Smrg             {
986181254a7Smrg                 g.put(Bytecode(IR.Backref, nref-localLimit));
987181254a7Smrg                 g.ir[$-1].setLocalRef();
988181254a7Smrg             }
989181254a7Smrg             else
990181254a7Smrg                 g.put(Bytecode(IR.Backref, nref));
991181254a7Smrg             g.markBackref(nref);
992181254a7Smrg             break;
993181254a7Smrg         default:
994*b1e83836Smrg             if (front == '\\' && !pat.empty)
995181254a7Smrg             {
996*b1e83836Smrg                 if (pat.front >= privateUseStart && pat.front <= privateUseEnd)
997181254a7Smrg                     enforce(false, "invalid escape sequence");
998181254a7Smrg             }
999*b1e83836Smrg             if (front >= privateUseStart && front <= privateUseEnd)
1000181254a7Smrg             {
1001*b1e83836Smrg                 g.endPattern(front - privateUseStart + 1);
1002181254a7Smrg                 break;
1003181254a7Smrg             }
1004*b1e83836Smrg             auto op = Bytecode(IR.Char, front);
1005*b1e83836Smrg             popFront();
1006181254a7Smrg             g.put(op);
1007181254a7Smrg         }
1008181254a7Smrg     }
1009181254a7Smrg 
1010181254a7Smrg     //
error(string msg)1011181254a7Smrg     @trusted void error(string msg)
1012181254a7Smrg     {
1013181254a7Smrg         import std.array : appender;
1014*b1e83836Smrg         import std.format.write : formattedWrite;
1015181254a7Smrg         auto app = appender!string();
1016181254a7Smrg         formattedWrite(app, "%s\nPattern with error: `%s` <--HERE-- `%s`",
1017181254a7Smrg                        msg, origin[0..$-pat.length], pat);
1018181254a7Smrg         throw new RegexException(app.data);
1019181254a7Smrg     }
1020181254a7Smrg 
1021181254a7Smrg     alias Char = BasicElementOf!R;
1022181254a7Smrg 
program()1023181254a7Smrg     @property program()
1024181254a7Smrg     {
1025181254a7Smrg         return makeRegex(this);
1026181254a7Smrg     }
1027181254a7Smrg }
1028181254a7Smrg 
1029181254a7Smrg /+
1030181254a7Smrg     Postproces the IR, then optimize.
1031181254a7Smrg +/
postprocess(Char)1032181254a7Smrg @trusted void postprocess(Char)(ref Regex!Char zis)
1033181254a7Smrg {//@@@BUG@@@ write is @system
1034181254a7Smrg     with(zis)
1035181254a7Smrg     {
1036181254a7Smrg         struct FixedStack(T)
1037181254a7Smrg         {
1038181254a7Smrg             T[] arr;
1039181254a7Smrg             uint _top;
1040181254a7Smrg             //this(T[] storage){   arr = storage; _top = -1; }
1041181254a7Smrg             @property ref T top(){  assert(!empty); return arr[_top]; }
1042181254a7Smrg             void push(T x){  arr[++_top] = x; }
1043181254a7Smrg             T pop() { assert(!empty);   return arr[_top--]; }
1044181254a7Smrg             @property bool empty(){   return _top == -1; }
1045181254a7Smrg         }
1046181254a7Smrg         auto counterRange = FixedStack!uint(new uint[maxCounterDepth+1], -1);
1047181254a7Smrg         counterRange.push(1);
1048181254a7Smrg         ulong cumRange = 0;
1049181254a7Smrg         for (uint i = 0; i < ir.length; i += ir[i].length)
1050181254a7Smrg         {
1051181254a7Smrg             if (ir[i].hotspot)
1052181254a7Smrg             {
1053181254a7Smrg                 assert(i + 1 < ir.length,
1054181254a7Smrg                     "unexpected end of IR while looking for hotspot");
1055181254a7Smrg                 ir[i+1] = Bytecode.fromRaw(hotspotTableSize);
1056181254a7Smrg                 hotspotTableSize += counterRange.top;
1057181254a7Smrg             }
1058181254a7Smrg             switch (ir[i].code)
1059181254a7Smrg             {
1060181254a7Smrg             case IR.RepeatStart, IR.RepeatQStart:
1061181254a7Smrg                 uint repEnd = cast(uint)(i + ir[i].data + IRL!(IR.RepeatStart));
1062181254a7Smrg                 assert(ir[repEnd].code == ir[i].paired.code);
1063181254a7Smrg                 immutable max = ir[repEnd + 4].raw;
1064181254a7Smrg                 ir[repEnd+2].raw = counterRange.top;
1065181254a7Smrg                 ir[repEnd+3].raw *= counterRange.top;
1066181254a7Smrg                 ir[repEnd+4].raw *= counterRange.top;
1067181254a7Smrg                 ulong cntRange = cast(ulong)(max)*counterRange.top;
1068181254a7Smrg                 cumRange += cntRange;
1069181254a7Smrg                 enforce(cumRange < maxCumulativeRepetitionLength,
1070181254a7Smrg                     "repetition length limit is exceeded");
1071181254a7Smrg                 counterRange.push(cast(uint) cntRange + counterRange.top);
1072181254a7Smrg                 threadCount += counterRange.top;
1073181254a7Smrg                 break;
1074181254a7Smrg             case IR.RepeatEnd, IR.RepeatQEnd:
1075181254a7Smrg                 threadCount += counterRange.top;
1076181254a7Smrg                 counterRange.pop();
1077181254a7Smrg                 break;
1078181254a7Smrg             case IR.GroupStart:
1079181254a7Smrg                 if (isBackref(ir[i].data))
1080181254a7Smrg                     ir[i].setBackrefence();
1081181254a7Smrg                 threadCount += counterRange.top;
1082181254a7Smrg                 break;
1083181254a7Smrg             case IR.GroupEnd:
1084181254a7Smrg                 if (isBackref(ir[i].data))
1085181254a7Smrg                     ir[i].setBackrefence();
1086181254a7Smrg                 threadCount += counterRange.top;
1087181254a7Smrg                 break;
1088181254a7Smrg             default:
1089181254a7Smrg                 threadCount += counterRange.top;
1090181254a7Smrg             }
1091181254a7Smrg         }
1092181254a7Smrg         checkIfOneShot();
1093181254a7Smrg         if (!(flags & RegexInfo.oneShot))
1094181254a7Smrg             kickstart = Kickstart!Char(zis, new uint[](256));
1095181254a7Smrg         debug(std_regex_allocation) writefln("IR processed, max threads: %d", threadCount);
1096181254a7Smrg         optimize(zis);
1097181254a7Smrg     }
1098181254a7Smrg }
1099181254a7Smrg 
fixupBytecode()1100181254a7Smrg void fixupBytecode()(Bytecode[] ir)
1101181254a7Smrg {
1102181254a7Smrg     Stack!uint fixups;
1103181254a7Smrg 
1104181254a7Smrg     with(IR) for (uint i=0; i<ir.length; i+= ir[i].length)
1105181254a7Smrg     {
1106181254a7Smrg         if (ir[i].isStart || ir[i].code == Option)
1107181254a7Smrg             fixups.push(i);
1108181254a7Smrg         else if (ir[i].code == OrEnd)
1109181254a7Smrg         {
1110181254a7Smrg             // Alternatives need more care
1111181254a7Smrg             auto j = fixups.pop(); // last Option
1112181254a7Smrg             ir[j].data = i -  j - ir[j].length;
1113181254a7Smrg             j = fixups.pop(); // OrStart
1114181254a7Smrg             ir[j].data = i - j - ir[j].length;
1115181254a7Smrg             ir[i].data = ir[j].data;
1116181254a7Smrg 
1117181254a7Smrg             // fixup all GotoEndOrs
1118181254a7Smrg             j = j + IRL!(OrStart);
1119181254a7Smrg             assert(ir[j].code == Option);
1120181254a7Smrg             for (;;)
1121181254a7Smrg             {
1122181254a7Smrg                 auto next = j + ir[j].data + IRL!(Option);
1123181254a7Smrg                 if (ir[next].code == IR.OrEnd)
1124181254a7Smrg                     break;
1125181254a7Smrg                 ir[next - IRL!(GotoEndOr)].data = i - next;
1126181254a7Smrg                 j = next;
1127181254a7Smrg             }
1128181254a7Smrg         }
1129181254a7Smrg         else if (ir[i].code == GotoEndOr)
1130181254a7Smrg         {
1131181254a7Smrg             auto j = fixups.pop(); // Option
1132181254a7Smrg             ir[j].data = i - j + IRL!(GotoEndOr)- IRL!(Option); // to the next option
1133181254a7Smrg         }
1134181254a7Smrg         else if (ir[i].isEnd)
1135181254a7Smrg         {
1136181254a7Smrg             auto j = fixups.pop();
1137181254a7Smrg             ir[i].data = i - j - ir[j].length;
1138181254a7Smrg             ir[j].data = ir[i].data;
1139181254a7Smrg         }
1140181254a7Smrg     }
1141181254a7Smrg     assert(fixups.empty);
1142181254a7Smrg }
1143181254a7Smrg 
optimize(Char)1144181254a7Smrg void optimize(Char)(ref Regex!Char zis)
1145181254a7Smrg {
1146181254a7Smrg     import std.array : insertInPlace;
1147181254a7Smrg     CodepointSet nextSet(uint idx)
1148181254a7Smrg     {
1149181254a7Smrg         CodepointSet set;
1150181254a7Smrg         with(zis) with(IR)
1151181254a7Smrg     Outer:
1152181254a7Smrg         for (uint i = idx; i < ir.length; i += ir[i].length)
1153181254a7Smrg         {
1154181254a7Smrg             switch (ir[i].code)
1155181254a7Smrg             {
1156181254a7Smrg                 case Char:
1157181254a7Smrg                     set.add(ir[i].data, ir[i].data+1);
1158181254a7Smrg                     goto default;
1159181254a7Smrg                 //TODO: OrChar
1160181254a7Smrg                 case Trie, CodepointSet:
1161181254a7Smrg                     set = zis.charsets[ir[i].data];
1162181254a7Smrg                     goto default;
1163181254a7Smrg                 case GroupStart,GroupEnd:
1164181254a7Smrg                     break;
1165181254a7Smrg                 default:
1166181254a7Smrg                     break Outer;
1167181254a7Smrg             }
1168181254a7Smrg         }
1169181254a7Smrg         return set;
1170181254a7Smrg     }
1171181254a7Smrg 
1172181254a7Smrg     with(zis) with(IR) for (uint i = 0; i < ir.length; i += ir[i].length)
1173181254a7Smrg     {
1174181254a7Smrg         if (ir[i].code == InfiniteEnd)
1175181254a7Smrg         {
1176181254a7Smrg             auto set = nextSet(i+IRL!(InfiniteEnd));
1177181254a7Smrg             if (!set.empty && set.length < 10_000)
1178181254a7Smrg             {
1179181254a7Smrg                 ir[i] = Bytecode(InfiniteBloomEnd, ir[i].data);
1180181254a7Smrg                 ir[i - ir[i].data - IRL!(InfiniteStart)] =
1181181254a7Smrg                     Bytecode(InfiniteBloomStart, ir[i].data);
1182181254a7Smrg                 ir.insertInPlace(i+IRL!(InfiniteEnd),
1183181254a7Smrg                     Bytecode.fromRaw(cast(uint) zis.filters.length));
1184181254a7Smrg                 zis.filters ~= BitTable(set);
1185181254a7Smrg                 fixupBytecode(ir);
1186181254a7Smrg             }
1187181254a7Smrg         }
1188181254a7Smrg     }
1189181254a7Smrg }
1190181254a7Smrg 
1191181254a7Smrg //IR code validator - proper nesting, illegal instructions, etc.
validateRe(Char)1192181254a7Smrg @trusted void validateRe(Char)(ref Regex!Char zis)
1193181254a7Smrg {//@@@BUG@@@ text is @system
1194181254a7Smrg     import std.conv : text;
1195181254a7Smrg     with(zis)
1196181254a7Smrg     {
1197181254a7Smrg         for (uint pc = 0; pc < ir.length; pc += ir[pc].length)
1198181254a7Smrg         {
1199181254a7Smrg             if (ir[pc].isStart || ir[pc].isEnd)
1200181254a7Smrg             {
1201181254a7Smrg                 immutable dest = ir[pc].indexOfPair(pc);
1202181254a7Smrg                 assert(dest < ir.length, text("Wrong length in opcode at pc=",
1203181254a7Smrg                     pc, " ", dest, " vs ", ir.length));
1204181254a7Smrg                 assert(ir[dest].paired ==  ir[pc],
1205181254a7Smrg                     text("Wrong pairing of opcodes at pc=", pc, "and pc=", dest));
1206181254a7Smrg             }
1207181254a7Smrg             else if (ir[pc].isAtom)
1208181254a7Smrg             {
1209181254a7Smrg 
1210181254a7Smrg             }
1211181254a7Smrg             else
1212181254a7Smrg                assert(0, text("Unknown type of instruction at pc=", pc));
1213181254a7Smrg         }
1214181254a7Smrg     }
1215181254a7Smrg }
1216