xref: /netbsd-src/external/gpl3/gcc/dist/libphobos/src/std/regex/internal/ir.d (revision b1e838363e3c6fc78a55519254d99869742dd33c)
1 /*
2     Implementation of std.regex IR, an intermediate representation
3     of a regular expression pattern.
4 
5     This is a common ground between frontend regex component (parser)
6     and backend components - generators, matchers and other "filters".
7 */
8 module std.regex.internal.ir;
9 
10 package(std.regex):
11 
12 import std.exception, std.meta, std.range.primitives, std.traits, std.uni;
13 
14 debug(std_regex_parser) import std.stdio;
15 // just a common trait, may be moved elsewhere
16 alias BasicElementOf(Range) = Unqual!(ElementEncodingType!Range);
17 
18 enum privateUseStart = '\U000F0000', privateUseEnd ='\U000FFFFD';
19 
20 // heuristic value determines maximum CodepointSet length suitable for linear search
21 enum maxCharsetUsed = 6;
22 
23 // another variable to tweak behavior of caching generated Tries for character classes
24 enum maxCachedMatchers = 8;
25 
26 alias Trie = CodepointSetTrie!(13, 8);
27 alias makeTrie = codepointSetTrie!(13, 8);
28 
29 CharMatcher[CodepointSet] matcherCache;
30 
31 //accessor with caching
getMatcher(CodepointSet set)32 @trusted CharMatcher getMatcher(CodepointSet set)
33 {
34     // almost all properties of AA are not @safe
35     // https://issues.dlang.org/show_bug.cgi?id=6357
36     if (__ctfe || maxCachedMatchers == 0)
37         return CharMatcher(set);
38     else
39     {
40         auto p = set in matcherCache;
41         if (p)
42             return *p;
43         if (matcherCache.length == maxCachedMatchers)
44         {
45             // flush enmatchers in trieCache
46             matcherCache = null;
47         }
48         return (matcherCache[set] = CharMatcher(set));
49     }
50 }
51 
wordMatcher()52 @property ref wordMatcher()()
53 {
54     static immutable CharMatcher matcher = CharMatcher(wordCharacter);
55     return matcher;
56 }
57 
58 // some special Unicode white space characters
59 private enum NEL = '\u0085', LS = '\u2028', PS = '\u2029';
60 
61 //Regular expression engine/parser options:
62 // global - search  all nonoverlapping matches in input
63 // casefold - case insensitive matching, do casefolding on match in unicode mode
64 // freeform - ignore whitespace in pattern, to match space use [ ] or \s
65 // multiline - switch  ^, $ detect start and end of linesinstead of just start and end of input
66 enum RegexOption: uint {
67     global = 0x1,
68     casefold = 0x2,
69     freeform = 0x4,
70     nonunicode = 0x8,
71     multiline = 0x10,
72     singleline = 0x20
73 }
74 //do not reorder this list
75 alias RegexOptionNames = AliasSeq!('g', 'i', 'x', 'U', 'm', 's');
76 static assert( RegexOption.max < 0x80);
77 
package(std)78 package(std) string regexOptionsToString()(uint flags) nothrow pure @safe
79 {
80     flags &= (RegexOption.max << 1) - 1;
81     if (!flags)
82         return "";
83     char[RegexOptionNames.length] buffer = void;
84     size_t pos = 0;
85     foreach (i, flag; __traits(allMembers, RegexOption))
86         if (flags & __traits(getMember, RegexOption, flag))
87             buffer[pos++] = RegexOptionNames[i];
88     return buffer[0 .. pos].idup;
89 }
90 
91 // flags that allow guide execution of engine
92 enum RegexInfo : uint { oneShot = 0x80 }
93 
94 // IR bit pattern: 0b1_xxxxx_yy
95 // where yy indicates class of instruction, xxxxx for actual operation code
96 //     00: atom, a normal instruction
97 //     01: open, opening of a group, has length of contained IR in the low bits
98 //     10: close, closing of a group, has length of contained IR in the low bits
99 //     11 unused
100 //
101 // Loops with Q (non-greedy, with ? mark) must have the same size / other properties as non Q version
102 // Possible changes:
103 //* merge group, option, infinite/repeat start (to never copy during parsing of (a|b){1,2})
104 //* reorganize groups to make n args easier to find, or simplify the check for groups of similar ops
105 //  (like lookaround), or make it easier to identify hotspots.
106 
107 enum IR:uint {
108     Char              = 0b1_00000_00, //a character
109     Any               = 0b1_00001_00, //any character
110     CodepointSet      = 0b1_00010_00, //a most generic CodepointSet [...]
111     Trie              = 0b1_00011_00, //CodepointSet implemented as Trie
112     //match with any of a consecutive OrChar's in this sequence
113     //(used for case insensitive match)
114     //OrChar holds in upper two bits of data total number of OrChars in this _sequence_
115     //the drawback of this representation is that it is difficult
116     // to detect a jump in the middle of it
117     OrChar             = 0b1_00100_00,
118     Nop                = 0b1_00101_00, //no operation (padding)
119     End                = 0b1_00110_00, //end of program
120     Bol                = 0b1_00111_00, //beginning of a line ^
121     Eol                = 0b1_01000_00, //end of a line $
122     Wordboundary       = 0b1_01001_00, //boundary of a word
123     Notwordboundary    = 0b1_01010_00, //not a word boundary
124     Backref            = 0b1_01011_00, //backreference to a group (that has to be pinned, i.e. locally unique) (group index)
125     GroupStart         = 0b1_01100_00, //start of a group (x) (groupIndex+groupPinning(1bit))
126     GroupEnd           = 0b1_01101_00, //end of a group (x) (groupIndex+groupPinning(1bit))
127     Option             = 0b1_01110_00, //start of an option within an alternation x | y (length)
128     GotoEndOr          = 0b1_01111_00, //end of an option (length of the rest)
129     Bof                = 0b1_10000_00, //begining of "file" (string) ^
130     Eof                = 0b1_10001_00, //end of "file" (string) $
131     //... any additional atoms here
132 
133     OrStart            = 0b1_00000_01, //start of alternation group  (length)
134     OrEnd              = 0b1_00000_10, //end of the or group (length,mergeIndex)
135     //with this instruction order
136     //bit mask 0b1_00001_00 could be used to test/set greediness
137     InfiniteStart      = 0b1_00001_01, //start of an infinite repetition x* (length)
138     InfiniteEnd        = 0b1_00001_10, //end of infinite repetition x* (length,mergeIndex)
139     InfiniteQStart     = 0b1_00010_01, //start of a non eager infinite repetition x*? (length)
140     InfiniteQEnd       = 0b1_00010_10, //end of non eager infinite repetition x*? (length,mergeIndex)
141     InfiniteBloomStart = 0b1_00011_01, //start of an filtered infinite repetition x* (length)
142     InfiniteBloomEnd   = 0b1_00011_10, //end of filtered infinite repetition x* (length,mergeIndex)
143     RepeatStart        = 0b1_00100_01, //start of a {n,m} repetition (length)
144     RepeatEnd          = 0b1_00100_10, //end of x{n,m} repetition (length,step,minRep,maxRep)
145     RepeatQStart       = 0b1_00101_01, //start of a non eager x{n,m}? repetition (length)
146     RepeatQEnd         = 0b1_00101_10, //end of non eager x{n,m}? repetition (length,step,minRep,maxRep)
147 
148     //
149     LookaheadStart     = 0b1_00110_01, //begin of the lookahead group (length)
150     LookaheadEnd       = 0b1_00110_10, //end of a lookahead group (length)
151     NeglookaheadStart  = 0b1_00111_01, //start of a negative lookahead (length)
152     NeglookaheadEnd    = 0b1_00111_10, //end of a negative lookahead (length)
153     LookbehindStart    = 0b1_01000_01, //start of a lookbehind (length)
154     LookbehindEnd      = 0b1_01000_10, //end of a lookbehind (length)
155     NeglookbehindStart = 0b1_01001_01, //start of a negative lookbehind (length)
156     NeglookbehindEnd   = 0b1_01001_10, //end of negative lookbehind (length)
157 }
158 
159 //a shorthand for IR length - full length of specific opcode evaluated at compile time
IRL(IR code)160 template IRL(IR code)
161 {
162     enum uint IRL =  lengthOfIR(code);
163 }
164 static assert(IRL!(IR.LookaheadStart) == 3);
165 
166 //how many parameters follow the IR, should be optimized fixing some IR bits
immediateParamsIR(IR i)167 int immediateParamsIR(IR i) @safe pure nothrow @nogc
168 {
169     switch (i)
170     {
171     case IR.OrEnd,IR.InfiniteEnd,IR.InfiniteQEnd:
172         return 1;  // merge table index
173     case IR.InfiniteBloomEnd:
174         return 2;  // bloom filter index + merge table index
175     case IR.RepeatEnd, IR.RepeatQEnd:
176         return 4;
177     case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
178         return 2;  // start-end of captures used
179     default:
180         return 0;
181     }
182 }
183 
184 //full length of IR instruction inlcuding all parameters that might follow it
lengthOfIR(IR i)185 int lengthOfIR(IR i) @safe pure nothrow @nogc
186 {
187     return 1 + immediateParamsIR(i);
188 }
189 
190 //full length of the paired IR instruction inlcuding all parameters that might follow it
lengthOfPairedIR(IR i)191 int lengthOfPairedIR(IR i) @safe pure nothrow @nogc
192 {
193     return 1 + immediateParamsIR(pairedIR(i));
194 }
195 
196 //if the operation has a merge point (this relies on the order of the ops)
hasMerge(IR i)197 bool hasMerge(IR i) @safe pure nothrow @nogc
198 {
199     return (i&0b11)==0b10 && i <= IR.RepeatQEnd;
200 }
201 
202 //is an IR that opens a "group"
isStartIR(IR i)203 bool isStartIR(IR i) @safe pure nothrow @nogc
204 {
205     return (i&0b11)==0b01;
206 }
207 
208 //is an IR that ends a "group"
isEndIR(IR i)209 bool isEndIR(IR i) @safe pure nothrow @nogc
210 {
211     return (i&0b11)==0b10;
212 }
213 
214 //is a standalone IR
isAtomIR(IR i)215 bool isAtomIR(IR i) @safe pure nothrow @nogc
216 {
217     return (i&0b11)==0b00;
218 }
219 
220 //makes respective pair out of IR i, swapping start/end bits of instruction
pairedIR(IR i)221 IR pairedIR(IR i) @safe pure nothrow @nogc
222 {
223     assert(isStartIR(i) || isEndIR(i));
224     return cast(IR) (i ^ 0b11);
225 }
226 
227 //encoded IR instruction
228 @safe pure
229 struct Bytecode
230 {
231     uint raw;
232     //natural constraints
233     enum maxSequence = 2+4;
234     enum maxData = 1 << 22;
235     enum maxRaw = 1 << 31;
236 
237 @safe pure:
thisBytecode238     this(IR code, uint data)
239     {
240         assert(data < (1 << 22) && code < 256);
241         raw = code << 24 | data;
242     }
243 
thisBytecode244     this(IR code, uint data, uint seq)
245     {
246         assert(data < (1 << 22) && code < 256 );
247         assert(seq >= 2 && seq < maxSequence);
248         raw = code << 24 | (seq - 2)<<22 | data;
249     }
250 
251     //store raw data
fromRawBytecode252     static Bytecode fromRaw(uint data)
253     {
254         Bytecode t;
255         t.raw = data;
256         return t;
257     }
258 
259     // bit twiddling helpers
260     // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985
dataBytecode261     @property uint data()() const { return raw & 0x003f_ffff; }
262 
dataBytecode263     @property void data()(uint val)
264     {
265         raw = (raw & ~0x003f_ffff) | (val & 0x003f_ffff);
266     }
267 
268     // ditto
269     // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985
sequenceBytecode270     @property uint sequence()() const { return 2 + (raw >> 22 & 0x3); }
271 
272     // ditto
273     // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985
codeBytecode274     @property IR code()() const { return cast(IR)(raw >> 24); }
275 
276     //ditto
hotspotBytecode277     @property bool hotspot() const { return hasMerge(code); }
278 
279     //test the class of this instruction
isAtomBytecode280     @property bool isAtom() const { return isAtomIR(code); }
281 
282     //ditto
isStartBytecode283     @property bool isStart() const { return isStartIR(code); }
284 
285     //ditto
isEndBytecode286     @property bool isEnd() const { return isEndIR(code); }
287 
288     //number of arguments for this instruction
argsBytecode289     @property int args() const { return immediateParamsIR(code); }
290 
291     //mark this GroupStart or GroupEnd as referenced in backreference
setBackrefenceBytecode292     void setBackrefence()
293     {
294         assert(code == IR.GroupStart || code == IR.GroupEnd);
295         raw = raw | 1 << 23;
296     }
297 
298     //is referenced
backreferenceBytecode299     @property bool backreference() const
300     {
301         assert(code == IR.GroupStart || code == IR.GroupEnd);
302         return cast(bool)(raw & 1 << 23);
303     }
304 
305     //mark as local reference (for backrefs in lookarounds)
setLocalRefBytecode306     void setLocalRef()
307     {
308         assert(code == IR.Backref);
309         raw = raw | 1 << 23;
310     }
311 
312     //is a local ref
localRefBytecode313     @property bool localRef() const
314     {
315         assert(code == IR.Backref);
316         return cast(bool)(raw & 1 << 23);
317     }
318 
319     //human readable name of instruction
mnemonicBytecode320     @trusted @property string mnemonic()() const
321     {//@@@BUG@@@ to is @system
322         import std.conv : to;
323         return to!string(code);
324     }
325 
326     //full length of instruction
lengthBytecode327     @property uint length() const
328     {
329         return lengthOfIR(code);
330     }
331 
332     //full length of respective start/end of this instruction
pairedLengthBytecode333     @property uint pairedLength() const
334     {
335         return lengthOfPairedIR(code);
336     }
337 
338     //returns bytecode of paired instruction (assuming this one is start or end)
pairedBytecode339     @property Bytecode paired() const
340     {//depends on bit and struct layout order
341         assert(isStart || isEnd);
342         return Bytecode.fromRaw(raw ^ 0b11 << 24);
343     }
344 
345     //gets an index into IR block of the respective pair
indexOfPairBytecode346     uint indexOfPair(uint pc) const
347     {
348         assert(isStart || isEnd);
349         return isStart ? pc + data + length  : pc - data - lengthOfPairedIR(code);
350     }
351 }
352 
353 static assert(Bytecode.sizeof == 4);
354 
355 
356 //index entry structure for name --> number of submatch
357 struct NamedGroup
358 {
359     string name;
360     uint group;
361 }
362 
363 //holds pair of start-end markers for a submatch
Group(DataIndex)364 struct Group(DataIndex)
365 {
366     DataIndex begin = DataIndex.max;
367     DataIndex end   = DataIndex.min;
368 
369     bool opCast(T : bool)() const
370     {
371         return begin <= end;
372     }
373 
374     @trusted string toString()() const
375     {
376         if (begin < end)
377             return "(unmatched)";
378         import std.array : appender;
379         import std.format.write : formattedWrite;
380         auto a = appender!string();
381         formattedWrite(a, "%s..%s", begin, end);
382         return a.data;
383     }
384 }
385 
386 //debugging tool, prints out instruction along with opcodes
387 @trusted string disassemble(in Bytecode[] irb, uint pc, in NamedGroup[] dict=[])
388 {
389     import std.array : appender;
390     import std.format.write : formattedWrite;
391     auto output = appender!string();
392     formattedWrite(output,"%s", irb[pc].mnemonic);
393     switch (irb[pc].code)
394     {
395     case IR.Char:
396         formattedWrite(output, " %s (0x%x)",cast(dchar) irb[pc].data, irb[pc].data);
397         break;
398     case IR.OrChar:
399         formattedWrite(output, " %s (0x%x) seq=%d", cast(dchar) irb[pc].data, irb[pc].data, irb[pc].sequence);
400         break;
401     case IR.RepeatStart, IR.InfiniteStart, IR.InfiniteBloomStart,
402     IR.Option, IR.GotoEndOr, IR.OrStart:
403         //forward-jump instructions
404         uint len = irb[pc].data;
405         formattedWrite(output, " pc=>%u", pc+len+IRL!(IR.RepeatStart));
406         break;
407     case IR.RepeatEnd, IR.RepeatQEnd: //backward-jump instructions
408         uint len = irb[pc].data;
409         formattedWrite(output, " pc=>%u min=%u max=%u step=%u",
410             pc - len, irb[pc + 3].raw, irb[pc + 4].raw, irb[pc + 2].raw);
411         break;
412     case IR.InfiniteEnd, IR.InfiniteQEnd, IR.InfiniteBloomEnd, IR.OrEnd: //ditto
413         uint len = irb[pc].data;
414         formattedWrite(output, " pc=>%u", pc-len);
415         break;
416     case  IR.LookaheadEnd, IR.NeglookaheadEnd: //ditto
417         uint len = irb[pc].data;
418         formattedWrite(output, " pc=>%u", pc-len);
419         break;
420     case IR.GroupStart, IR.GroupEnd:
421         uint n = irb[pc].data;
422         string name;
423         foreach (v;dict)
424             if (v.group == n)
425             {
426                 name = "'"~v.name~"'";
427                 break;
428             }
429         formattedWrite(output, " %s #%u " ~ (irb[pc].backreference ? "referenced" : ""),
430                 name, n);
431         break;
432     case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
433         uint len = irb[pc].data;
434         uint start = irb[pc+1].raw, end = irb[pc+2].raw;
435         formattedWrite(output, " pc=>%u [%u..%u]", pc + len + IRL!(IR.LookaheadStart), start, end);
436         break;
437     case IR.Backref: case IR.CodepointSet: case IR.Trie:
438         uint n = irb[pc].data;
439         formattedWrite(output, " %u",  n);
440         if (irb[pc].code == IR.Backref)
441             formattedWrite(output, " %s", irb[pc].localRef ? "local" : "global");
442         break;
443     default://all data-free instructions
444     }
445     if (irb[pc].hotspot)
446         formattedWrite(output, " Hotspot %u", irb[pc+1].raw);
447     return output.data;
448 }
449 
450 //disassemble the whole chunk
printBytecode()451 @trusted void printBytecode()(in Bytecode[] slice, in NamedGroup[] dict=[])
452 {
453     import std.stdio : writeln;
454     for (uint pc=0; pc<slice.length; pc += slice[pc].length)
455         writeln("\t", disassemble(slice, pc, dict));
456 }
457 
458 // Encapsulates memory management, explicit ref counting
459 // and the exact type of engine created
460 // there is a single instance per engine combination type x Char
461 // In future may also maintain a (TLS?) cache of memory
MatcherFactory(Char)462 interface MatcherFactory(Char)
463 {
464 @safe:
465     Matcher!Char create(const ref Regex!Char, in Char[] input) const;
466     Matcher!Char dup(Matcher!Char m, in Char[] input) const;
467     size_t incRef(Matcher!Char m) const;
468     size_t decRef(Matcher!Char m) const;
469 }
470 
471 // Only memory management, no compile-time vs run-time specialities
472 abstract class GenericFactory(alias EngineType, Char) : MatcherFactory!Char
473 {
474     import core.memory : pureFree;
475     import std.internal.memory : enforceMalloc;
476     import core.memory : GC;
477     // round up to next multiple of size_t for alignment purposes
478     enum classSize = (__traits(classInstanceSize, EngineType!Char) + size_t.sizeof - 1) & ~(size_t.sizeof - 1);
479 
480     EngineType!Char construct(const ref Regex!Char re, in Char[] input, void[] memory) const;
481 
482     override EngineType!Char create(const ref Regex!Char re, in Char[] input) const @trusted
483     {
484         immutable size = EngineType!Char.initialMemory(re) + classSize;
485         auto memory = enforceMalloc(size)[0 .. size];
486         scope(failure) pureFree(memory.ptr);
487         GC.addRange(memory.ptr, classSize);
488         auto engine = construct(re, input, memory);
489         assert(engine.refCount == 1);
490         assert(cast(void*) engine == memory.ptr);
491         return engine;
492     }
493 
494     override EngineType!Char dup(Matcher!Char engine, in Char[] input) const @trusted
495     {
496         immutable size = EngineType!Char.initialMemory(engine.pattern) + classSize;
497         auto memory = enforceMalloc(size)[0 .. size];
498         scope(failure) pureFree(memory.ptr);
499         auto copy = construct(engine.pattern, input, memory);
500         GC.addRange(memory.ptr, classSize);
501         engine.dupTo(copy, memory[classSize .. size]);
502         assert(copy.refCount == 1);
503         return copy;
504     }
505 
506     override size_t incRef(Matcher!Char m) const
507     {
508         return ++m.refCount;
509     }
510 
511     override size_t decRef(Matcher!Char m) const  @trusted
512     {
513         assert(m.refCount != 0);
514         auto cnt = --m.refCount;
515         if (cnt == 0)
516         {
517             void* ptr = cast(void*) m;
518             GC.removeRange(ptr);
519             pureFree(ptr);
520         }
521         return cnt;
522     }
523 }
524 
525 // A factory for run-time engines
526 class RuntimeFactory(alias EngineType, Char) : GenericFactory!(EngineType, Char)
527 {
528     override EngineType!Char construct(const ref Regex!Char re, in Char[] input, void[] memory) const
529     {
530         import core.lifetime : emplace;
531         return emplace!(EngineType!Char)(memory[0 .. classSize],
532             re, Input!Char(input), memory[classSize .. $]);
533     }
534 }
535 
536 // A factory for compile-time engine
537 class CtfeFactory(alias EngineType, Char, alias func) : GenericFactory!(EngineType, Char)
538 {
539     override EngineType!Char construct(const ref Regex!Char re, in Char[] input, void[] memory) const
540     {
541         import core.lifetime : emplace;
542         return emplace!(EngineType!Char)(memory[0 .. classSize],
543             re, &func, Input!Char(input), memory[classSize .. $]);
544     }
545 }
546 
defaultFactoryImpl(Char)547 private auto defaultFactoryImpl(Char)(const ref Regex!Char re)
548 {
549     import std.regex.internal.backtracking : BacktrackingMatcher;
550     import std.regex.internal.thompson : ThompsonMatcher;
551     import std.algorithm.searching : canFind;
552     static MatcherFactory!Char backtrackingFactory;
553     static MatcherFactory!Char thompsonFactory;
554     if (re.backrefed.canFind!"a != 0")
555     {
556         if (backtrackingFactory is null)
557             backtrackingFactory = new RuntimeFactory!(BacktrackingMatcher, Char);
558         return backtrackingFactory;
559     }
560     else
561     {
562         if (thompsonFactory is null)
563             thompsonFactory = new RuntimeFactory!(ThompsonMatcher, Char);
564         return thompsonFactory;
565     }
566 }
567 
568 // Used to generate a pure wrapper for defaultFactoryImpl. Based on the example in
569 // the std.traits.SetFunctionAttributes documentation.
570 auto assumePureFunction(T)(T t)
571 if (isFunctionPointer!T)
572 {
573     enum attrs = functionAttributes!T | FunctionAttribute.pure_;
574     return cast(SetFunctionAttributes!(T, functionLinkage!T, attrs)) t;
575 }
576 
577 // A workaround for R-T enum re = regex(...)
defaultFactory(Char)578 template defaultFactory(Char)
579 {
580     // defaultFactory is constructed as a safe, pure wrapper over defaultFactoryImpl.
581     // It can be faked as pure because the static mutable variables are used to cache
582     // the key and character matcher. The technique used avoids delegates and GC.
583     @property MatcherFactory!Char defaultFactory(const ref Regex!Char re) @safe pure
584     {
585         static auto impl(const ref Regex!Char re)
586         {
587             return defaultFactoryImpl(re);
588         }
589 
590         static auto pureImpl(const ref Regex!Char re) @trusted
591         {
592             auto p = assumePureFunction(&impl);
593             return p(re);
594         }
595 
596         return pureImpl(re);
597     }
598 }
599 
600 // Defining it as an interface has the undesired side-effect:
601 // casting any class to an interface silently adjusts pointer to point to a nested vtbl
Matcher(Char)602 abstract class Matcher(Char)
603 {
604 abstract:
605     // Get a (next) match
606     int match(Group!size_t[] matches) pure;
607     // This only maintains internal ref-count,
608     // deallocation happens inside MatcherFactory
609     @property ref size_t refCount() @safe;
610     // Copy internal state to another engine, using memory arena 'memory'
611     void dupTo(Matcher!Char m, void[] memory);
612     // The pattern loaded
613     @property ref const(Regex!Char) pattern() @safe;
614     // Re-arm the engine with new Input
615     Matcher rearm(in Char[] stream);
616 }
617 
618 /++
619     `Regex` object holds regular expression pattern in compiled form.
620     Instances of this object are constructed via calls to `regex`.
621     This is an intended form for caching and storage of frequently
622     used regular expressions.
623 +/
Regex(Char)624 struct Regex(Char)
625 {
626     //temporary workaround for identifier lookup
627     CodepointSet[] charsets; //
628     Bytecode[] ir;      //compiled bytecode of pattern
629 
630 
631     @safe @property bool empty() const nothrow {  return ir is null; }
632     /++
633     `namedCaptures` returns a range of all named captures in a given regular expression.
634     +/
635     @safe @property auto namedCaptures()
636     {
637         static struct NamedGroupRange
638         {
639         private:
640             const(NamedGroup)[] groups;
641             size_t start;
642             size_t end;
643         public:
644             this(const(NamedGroup)[] g, size_t s, size_t e)
645             {
646                 assert(s <= e);
647                 assert(e <= g.length);
648                 groups = g;
649                 start = s;
650                 end = e;
651             }
652 
653             @property string front() { return groups[start].name; }
654             @property string back() { return groups[end-1].name; }
655             @property bool empty() { return start >= end; }
656             @property size_t length() { return end - start; }
657             alias opDollar = length;
658             @property NamedGroupRange save()
659             {
660                 return NamedGroupRange(groups, start, end);
661             }
662             void popFront() { assert(!empty); start++; }
663             void popBack() { assert(!empty); end--; }
664             string opIndex()(size_t i)
665             {
666                 assert(start + i < end,
667                        "Requested named group is out of range.");
668                 return groups[start+i].name;
669             }
670             NamedGroupRange opSlice(size_t low, size_t high) {
671                 assert(low <= high);
672                 assert(start + high <= end);
673                 return NamedGroupRange(groups, start + low, start + high);
674             }
675             NamedGroupRange opSlice() { return this.save; }
676         }
677         return NamedGroupRange(dict, 0, dict.length);
678     }
679 
680 package(std.regex):
681     import std.regex.internal.kickstart : Kickstart; //TODO: get rid of this dependency
682     const(NamedGroup)[] dict;              // maps name -> user group number
683     uint ngroup;                           // number of internal groups
684     uint maxCounterDepth;                  // max depth of nested {n,m} repetitions
685     uint hotspotTableSize;                 // number of entries in merge table
686     uint threadCount;                      // upper bound on number of Thompson VM threads
687     uint flags;                            // global regex flags
688     public const(CharMatcher)[]  matchers; // tables that represent character sets
689     public const(BitTable)[] filters;      // bloom filters for conditional loops
690     uint[] backrefed;                      // bit array of backreferenced submatches
691     Kickstart!Char kickstart;
692     MatcherFactory!Char factory;           // produces optimal matcher for this pattern
693     immutable(Char)[] pattern;             // copy of pattern to serve as cache key
694 
695     const(Regex) withFactory(MatcherFactory!Char factory) pure const @trusted
696     {
697         auto r = cast() this;
698         r.factory = factory;
699         return r;
700     }
701 
702     const(Regex) withFlags(uint newFlags) pure const @trusted
703     {
704         auto r = cast() this;
705         r.flags = newFlags;
706         return r;
707     }
708 
709     const(Regex) withCode(const(Bytecode)[] code) pure const @trusted
710     {
711         auto r = cast() this;
712         r.ir = code.dup; // TODO: sidestep const instead?
713         return r;
714     }
715 
716     const(Regex) withNGroup(uint nGroup) pure const @trusted
717     {
718         auto r = cast() this;
719         r.ngroup = nGroup;
720         return r;
721     }
722 
723     //bit access helper
724     uint isBackref(uint n)
725     {
726         if (n/32 >= backrefed.length)
727             return 0;
728         return backrefed[n / 32] & (1 << (n & 31));
729     }
730 
731     //check if searching is not needed
732     void checkIfOneShot()
733     {
734     L_CheckLoop:
735         for (uint i = 0; i < ir.length; i += ir[i].length)
736         {
737             switch (ir[i].code)
738             {
739                 case IR.Bof:
740                     flags |= RegexInfo.oneShot;
741                     break L_CheckLoop;
742                 case IR.GroupStart, IR.GroupEnd, IR.Bol, IR.Eol, IR.Eof,
743                 IR.Wordboundary, IR.Notwordboundary:
744                     break;
745                 default:
746                     break L_CheckLoop;
747             }
748         }
749     }
750 
751     //print out disassembly a program's IR
752     @trusted debug(std_regex_parser) void print() const
753     {//@@@BUG@@@ write is system
754         for (uint i = 0; i < ir.length; i += ir[i].length)
755         {
756             writefln("%d\t%s ", i, disassemble(ir, i, dict));
757         }
758         writeln("Total merge table size: ", hotspotTableSize);
759         writeln("Max counter nesting depth: ", maxCounterDepth);
760     }
761 
762     public string toString()() const
763     {
764         import std.format : format;
765         static if (is(typeof(pattern) : string))
766             alias patternString = pattern;
767         else
768         {
769             import std.conv : to;
770             auto patternString = conv.to!string(pattern);
771         }
772         auto quotedEscapedPattern = format("%(%s %)", [patternString]);
773         auto flagString = regexOptionsToString(flags);
774         return "Regex!" ~ Char.stringof ~ "(" ~ quotedEscapedPattern ~ ", \"" ~ flagString ~ "\")";
775     }
776 }
777 
778 // The stuff below this point is temporarrily part of IR module
779 // but may need better place in the future (all internals)
780 package(std.regex):
781 
782 //Simple UTF-string abstraction compatible with stream interface
783 struct Input(Char)
784 if (is(Char :dchar))
785 {
786     import std.utf : decode;
787     alias DataIndex = size_t;
788     enum bool isLoopback = false;
789     alias String = const(Char)[];
790     String _origin;
791     size_t _index;
792 
793     //constructs Input object out of plain string
794     this(String input, size_t idx = 0)
795     {
796         _origin = input;
797         _index = idx;
798     }
799 
800     //codepoint at current stream position
pragma(inline,true)801     pragma(inline, true) bool nextChar(ref dchar res, ref size_t pos)
802     {
803         pos = _index;
804         // DMD's inliner hates multiple return functions
805         // but can live with single statement if/else bodies
806         bool n = !(_index == _origin.length);
807         if (n)
808             res = decode(_origin, _index);
809         return n;
810     }
atEnd()811     @property bool atEnd(){
812         return _index == _origin.length;
813     }
search(Kickstart)814     bool search(Kickstart)(ref const Kickstart kick, ref dchar res, ref size_t pos)
815     {
816         size_t idx = kick.search(_origin, _index);
817         _index = idx;
818         return nextChar(res, pos);
819     }
820 
821     //index of at End position
lastIndex()822     @property size_t lastIndex(){   return _origin.length; }
823 
824     //support for backtracker engine, might not be present
reset(size_t index)825     void reset(size_t index){   _index = index;  }
826 
opSlice(size_t start,size_t end)827     String opSlice(size_t start, size_t end){   return _origin[start .. end]; }
828 
loopBack(size_t index)829     auto loopBack(size_t index){   return BackLooper!Input(this, index); }
830 }
831 
BackLooperImpl(Input)832 struct BackLooperImpl(Input)
833 {
834     import std.utf : strideBack;
835     alias DataIndex = size_t;
836     alias String = Input.String;
837     enum bool isLoopback = true;
838     String _origin;
839     size_t _index;
840     this(Input input, size_t index)
841     {
842         _origin = input._origin;
843         _index = index;
844     }
845     this(String input)
846     {
847         _origin = input;
848         _index = input.length;
849     }
850     @trusted bool nextChar(ref dchar res,ref size_t pos)
851     {
852         pos = _index;
853         if (_index == 0)
854             return false;
855 
856         res = _origin[0.._index].back;
857         _index -= strideBack(_origin, _index);
858 
859         return true;
860     }
861     @property atEnd(){ return _index == 0 || _index == strideBack(_origin, _index); }
862     auto loopBack(size_t index){   return Input(_origin, index); }
863 
864     //support for backtracker engine, might not be present
865     //void reset(size_t index){   _index = index ? index-std.utf.strideBack(_origin, index) : 0;  }
866     void reset(size_t index){   _index = index;  }
867 
868     String opSlice(size_t start, size_t end){   return _origin[end .. start]; }
869     //index of at End position
870     @property size_t lastIndex(){   return 0; }
871 }
872 
BackLooper(E)873 template BackLooper(E)
874 {
875     static if (is(E : BackLooperImpl!U, U))
876     {
877         alias BackLooper = U;
878     }
879     else
880     {
881         alias BackLooper = BackLooperImpl!E;
882     }
883 }
884 
885 //both helpers below are internal, on its own are quite "explosive"
886 //unsafe, no initialization of elements
mallocArray(T)887 @system pure T[] mallocArray(T)(size_t len)
888 {
889     import core.memory : pureMalloc;
890     return (cast(T*) pureMalloc(len * T.sizeof))[0 .. len];
891 }
892 
893 //very unsafe, no initialization
arrayInChunk(T)894 @system T[] arrayInChunk(T)(size_t len, ref void[] chunk)
895 {
896     auto ret = (cast(T*) chunk.ptr)[0 .. len];
897     chunk = chunk[len * T.sizeof .. $];
898     return ret;
899 }
900 
901 //
lookupNamedGroup(String)902 @trusted uint lookupNamedGroup(String)(const(NamedGroup)[] dict, String name)
903 {//equal is @system?
904     import std.algorithm.comparison : equal;
905     import std.algorithm.iteration : map;
906     import std.conv : text;
907     import std.range : assumeSorted;
908 
909     auto fnd = assumeSorted!"cmp(a,b) < 0"(map!"a.name"(dict)).lowerBound(name).length;
910     enforce(fnd < dict.length && equal(dict[fnd].name, name),
911         text("no submatch named ", name));
912     return dict[fnd].group;
913 }
914 
915 // whether ch is one of unicode newline sequences
916 // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985
endOfLine()917 bool endOfLine()(dchar front, bool seenCr)
918 {
919     return ((front == '\n') ^ seenCr) || front == '\r'
920     || front == NEL || front == LS || front == PS;
921 }
922 
923 // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985
startOfLine()924 bool startOfLine()(dchar back, bool seenNl)
925 {
926     return ((back == '\r') ^ seenNl) || back == '\n'
927     || back == NEL || back == LS || back == PS;
928 }
929 
930 ///Exception object thrown in case of errors during regex compilation.
931 public class RegexException : Exception
932 {
933     mixin basicExceptionCtors;
934 }
935 
936 // simple 128-entry bit-table used with a hash function
937 struct BitTable {
938     uint[4] filter;
939 
thisBitTable940     this(CodepointSet set){
941         foreach (iv; set.byInterval)
942         {
943             foreach (v; iv.a .. iv.b)
944                 add(v);
945         }
946     }
947 
addBitTable948     void add()(dchar ch){
949         immutable i = index(ch);
950         filter[i >> 5]  |=  1<<(i & 31);
951     }
952     // non-zero -> might be present, 0 -> absent
opIndexBitTable953     bool opIndex()(dchar ch) const{
954         immutable i = index(ch);
955         return (filter[i >> 5]>>(i & 31)) & 1;
956     }
957 
indexBitTable958     static uint index()(dchar ch){
959         return ((ch >> 7) ^ ch) & 0x7F;
960     }
961 }
962 
963 struct CharMatcher {
964     BitTable ascii; // fast path for ASCII
965     Trie trie;      // slow path for Unicode
966 
this(CodepointSet set)967     this(CodepointSet set)
968     {
969         auto asciiSet = set & unicode.ASCII;
970         ascii = BitTable(asciiSet);
971         trie = makeTrie(set);
972     }
973 
opIndex()974     bool opIndex()(dchar ch) const
975     {
976         if (ch < 0x80)
977             return ascii[ch];
978         else
979             return trie[ch];
980     }
981 }
982 
983 // Internal non-resizeble array, switches between inline storage and CoW
984 // POD-only
985 struct SmallFixedArray(T, uint SMALL=3)
986 if (!hasElaborateDestructor!T)
987 {
988     import std.internal.memory : enforceMalloc;
989     import core.memory : pureFree;
990     static struct Payload
991     {
992         size_t refcount;
993         T[0] placeholder;
inoutSmallFixedArray::Payload994         inout(T)* ptr() inout { return placeholder.ptr; }
995     }
996     static assert(Payload.sizeof == size_t.sizeof);
997     union
998     {
999         Payload* big;
1000         T[SMALL] small;
1001     }
1002     size_t _sizeMask;
1003     enum BIG_MASK = size_t(1)<<(8*size_t.sizeof-1);
1004     enum SIZE_MASK = ~BIG_MASK;
1005 
isBigSmallFixedArray1006     @property bool isBig() const { return (_sizeMask & BIG_MASK) != 0; }
lengthSmallFixedArray1007     @property size_t length() const { return _sizeMask & SIZE_MASK; }
1008 
thisSmallFixedArray1009     this(size_t size)
1010     {
1011         if (size <= SMALL)
1012         {
1013             small[] = T.init;
1014             _sizeMask = size;
1015         }
1016         else
1017         {
1018             big = cast(Payload*) enforceMalloc(Payload.sizeof + T.sizeof*size);
1019             big.refcount = 1;
1020             _sizeMask = size | BIG_MASK;
1021         }
1022     }
1023 
inoutSmallFixedArray1024     private @trusted @property inout(T)[] internalSlice() inout
1025     {
1026         return isBig ? big.ptr[0 .. length] : small[0 .. length];
1027     }
1028 
thisSmallFixedArray1029     this(this)
1030     {
1031         if (isBig)
1032         {
1033             big.refcount++;
1034         }
1035     }
1036 
opEqualsSmallFixedArray1037     bool opEquals(SmallFixedArray a)
1038     {
1039         return internalSlice[] == a.internalSlice[];
1040     }
1041 
toHashSmallFixedArray1042     size_t toHash() const
1043     {
1044         return hashOf(internalSlice[]);
1045     }
1046 
inoutSmallFixedArray1047     ref inout(T) opIndex(size_t idx) inout
1048     {
1049         return internalSlice[idx];
1050     }
1051 
1052     // accesses big to test self-referencing so not @safe
opAssignSmallFixedArray1053     @trusted ref opAssign(SmallFixedArray arr)
1054     {
1055         if (isBig)
1056         {
1057             if (arr.isBig)
1058             {
1059                 if (big is arr.big) return this; // self-assign
1060                 else
1061                 {
1062                     abandonRef();
1063                     _sizeMask = arr._sizeMask;
1064                     big = arr.big;
1065                     big.refcount++;
1066                 }
1067             }
1068             else
1069             {
1070                 abandonRef();
1071                 _sizeMask = arr._sizeMask;
1072                 small = arr.small;
1073             }
1074         }
1075         else
1076         {
1077             if (arr.isBig)
1078             {
1079                 _sizeMask = arr._sizeMask;
1080                 big = arr.big;
1081                 big.refcount++;
1082             }
1083             else
1084             {
1085                 _sizeMask = arr._sizeMask;
1086                 small = arr.small;
1087             }
1088         }
1089         return this;
1090     }
1091 
mutateSmallFixedArray1092     void mutate(scope void delegate(T[]) pure filler)
1093     {
1094         if (isBig && big.refcount != 1) // copy on write
1095         {
1096             auto oldSizeMask = _sizeMask;
1097             auto newbig = cast(Payload*) enforceMalloc(Payload.sizeof + T.sizeof*length);
1098             newbig.refcount = 1;
1099             abandonRef();
1100             big = newbig;
1101             _sizeMask = oldSizeMask;
1102         }
1103         filler(internalSlice);
1104     }
1105 
~thisSmallFixedArray1106     ~this()
1107     {
1108         if (isBig)
1109         {
1110             abandonRef();
1111         }
1112     }
1113 
abandonRefSmallFixedArray1114     @trusted private void abandonRef()
1115     {
1116         assert(isBig);
1117         if (--big.refcount == 0)
1118         {
1119             pureFree(big);
1120             _sizeMask = 0;
1121             assert(!isBig);
1122         }
1123     }
1124 }
1125 
1126 @system unittest
1127 {
1128     alias SA = SmallFixedArray!(int, 2);
create(int[]data)1129     SA create(int[] data)
1130     {
1131         SA a = SA(data.length);
1132         a.mutate((slice) { slice[] = data[]; });
1133         assert(a.internalSlice == data);
1134         return a;
1135     }
1136 
1137     {
1138         SA a;
1139         a = SA(1);
1140         assert(a.length == 1);
1141         a = SA.init;
1142         assert(a.length == 0);
1143     }
1144 
1145     {
1146         SA a, b, c, d;
1147         assert(a.length == 0);
1148         assert(a.internalSlice == b.internalSlice);
1149         a = create([1]);
1150         assert(a.internalSlice == [1]);
1151         b = create([2, 3]);
1152         assert(b.internalSlice == [2, 3]);
1153         c = create([3, 4, 5]);
1154         d = create([5, 6, 7, 8]);
1155         assert(c.isBig);
1156         a = c;
1157         assert(a.isBig);
1158         assert(a.big is c.big);
1159         assert(a.big.refcount == 2);
1160         assert(a.internalSlice == [3, 4, 5]);
1161         assert(c.internalSlice == [3, 4, 5]);
1162         a = b;
1163         assert(!a.isBig);
1164         assert(a.internalSlice == [2, 3]);
1165         assert(c.big.refcount == 1);
1166         a = c;
1167         assert(c.big.refcount == 2);
1168 
1169         // mutate copies on write if ref-count is not 1
1170         a.mutate((slice){ slice[] = 1; });
1171         assert(a.internalSlice == [1, 1, 1]);
1172         assert(c.internalSlice == [3, 4, 5]);
1173         assert(a.isBig && c.isBig);
1174         assert(a.big.refcount == 1);
1175         assert(c.big.refcount == 1);
1176 
1177         auto e = d;
1178         assert(e.big.refcount == 2);
1179         auto f = d;
1180         f = a;
1181         assert(f.isBig);
1182         assert(f.internalSlice == [1, 1, 1]);
1183         assert(f.big.refcount == 2); // a & f
1184         assert(e.big.refcount == 2); // d & e
1185         a = c;
1186         assert(f.big.refcount == 1); // f
1187         assert(e.big.refcount == 2); // d & e
1188         a = a;
1189         a = a;
1190         a = a;
1191         assert(a.big.refcount == 2); // a & c
1192     }
1193 }
1194