1 /*
2 Implementation of std.regex IR, an intermediate representation
3 of a regular expression pattern.
4
5 This is a common ground between frontend regex component (parser)
6 and backend components - generators, matchers and other "filters".
7 */
8 module std.regex.internal.ir;
9
10 package(std.regex):
11
12 import std.exception, std.meta, std.range.primitives, std.traits, std.uni;
13
14 debug(std_regex_parser) import std.stdio;
15 // just a common trait, may be moved elsewhere
16 alias BasicElementOf(Range) = Unqual!(ElementEncodingType!Range);
17
18 enum privateUseStart = '\U000F0000', privateUseEnd ='\U000FFFFD';
19
20 // heuristic value determines maximum CodepointSet length suitable for linear search
21 enum maxCharsetUsed = 6;
22
23 // another variable to tweak behavior of caching generated Tries for character classes
24 enum maxCachedMatchers = 8;
25
26 alias Trie = CodepointSetTrie!(13, 8);
27 alias makeTrie = codepointSetTrie!(13, 8);
28
29 CharMatcher[CodepointSet] matcherCache;
30
31 //accessor with caching
getMatcher(CodepointSet set)32 @trusted CharMatcher getMatcher(CodepointSet set)
33 {
34 // almost all properties of AA are not @safe
35 // https://issues.dlang.org/show_bug.cgi?id=6357
36 if (__ctfe || maxCachedMatchers == 0)
37 return CharMatcher(set);
38 else
39 {
40 auto p = set in matcherCache;
41 if (p)
42 return *p;
43 if (matcherCache.length == maxCachedMatchers)
44 {
45 // flush enmatchers in trieCache
46 matcherCache = null;
47 }
48 return (matcherCache[set] = CharMatcher(set));
49 }
50 }
51
wordMatcher()52 @property ref wordMatcher()()
53 {
54 static immutable CharMatcher matcher = CharMatcher(wordCharacter);
55 return matcher;
56 }
57
58 // some special Unicode white space characters
59 private enum NEL = '\u0085', LS = '\u2028', PS = '\u2029';
60
61 //Regular expression engine/parser options:
62 // global - search all nonoverlapping matches in input
63 // casefold - case insensitive matching, do casefolding on match in unicode mode
64 // freeform - ignore whitespace in pattern, to match space use [ ] or \s
65 // multiline - switch ^, $ detect start and end of linesinstead of just start and end of input
66 enum RegexOption: uint {
67 global = 0x1,
68 casefold = 0x2,
69 freeform = 0x4,
70 nonunicode = 0x8,
71 multiline = 0x10,
72 singleline = 0x20
73 }
74 //do not reorder this list
75 alias RegexOptionNames = AliasSeq!('g', 'i', 'x', 'U', 'm', 's');
76 static assert( RegexOption.max < 0x80);
77
package(std)78 package(std) string regexOptionsToString()(uint flags) nothrow pure @safe
79 {
80 flags &= (RegexOption.max << 1) - 1;
81 if (!flags)
82 return "";
83 char[RegexOptionNames.length] buffer = void;
84 size_t pos = 0;
85 foreach (i, flag; __traits(allMembers, RegexOption))
86 if (flags & __traits(getMember, RegexOption, flag))
87 buffer[pos++] = RegexOptionNames[i];
88 return buffer[0 .. pos].idup;
89 }
90
91 // flags that allow guide execution of engine
92 enum RegexInfo : uint { oneShot = 0x80 }
93
94 // IR bit pattern: 0b1_xxxxx_yy
95 // where yy indicates class of instruction, xxxxx for actual operation code
96 // 00: atom, a normal instruction
97 // 01: open, opening of a group, has length of contained IR in the low bits
98 // 10: close, closing of a group, has length of contained IR in the low bits
99 // 11 unused
100 //
101 // Loops with Q (non-greedy, with ? mark) must have the same size / other properties as non Q version
102 // Possible changes:
103 //* merge group, option, infinite/repeat start (to never copy during parsing of (a|b){1,2})
104 //* reorganize groups to make n args easier to find, or simplify the check for groups of similar ops
105 // (like lookaround), or make it easier to identify hotspots.
106
107 enum IR:uint {
108 Char = 0b1_00000_00, //a character
109 Any = 0b1_00001_00, //any character
110 CodepointSet = 0b1_00010_00, //a most generic CodepointSet [...]
111 Trie = 0b1_00011_00, //CodepointSet implemented as Trie
112 //match with any of a consecutive OrChar's in this sequence
113 //(used for case insensitive match)
114 //OrChar holds in upper two bits of data total number of OrChars in this _sequence_
115 //the drawback of this representation is that it is difficult
116 // to detect a jump in the middle of it
117 OrChar = 0b1_00100_00,
118 Nop = 0b1_00101_00, //no operation (padding)
119 End = 0b1_00110_00, //end of program
120 Bol = 0b1_00111_00, //beginning of a line ^
121 Eol = 0b1_01000_00, //end of a line $
122 Wordboundary = 0b1_01001_00, //boundary of a word
123 Notwordboundary = 0b1_01010_00, //not a word boundary
124 Backref = 0b1_01011_00, //backreference to a group (that has to be pinned, i.e. locally unique) (group index)
125 GroupStart = 0b1_01100_00, //start of a group (x) (groupIndex+groupPinning(1bit))
126 GroupEnd = 0b1_01101_00, //end of a group (x) (groupIndex+groupPinning(1bit))
127 Option = 0b1_01110_00, //start of an option within an alternation x | y (length)
128 GotoEndOr = 0b1_01111_00, //end of an option (length of the rest)
129 Bof = 0b1_10000_00, //begining of "file" (string) ^
130 Eof = 0b1_10001_00, //end of "file" (string) $
131 //... any additional atoms here
132
133 OrStart = 0b1_00000_01, //start of alternation group (length)
134 OrEnd = 0b1_00000_10, //end of the or group (length,mergeIndex)
135 //with this instruction order
136 //bit mask 0b1_00001_00 could be used to test/set greediness
137 InfiniteStart = 0b1_00001_01, //start of an infinite repetition x* (length)
138 InfiniteEnd = 0b1_00001_10, //end of infinite repetition x* (length,mergeIndex)
139 InfiniteQStart = 0b1_00010_01, //start of a non eager infinite repetition x*? (length)
140 InfiniteQEnd = 0b1_00010_10, //end of non eager infinite repetition x*? (length,mergeIndex)
141 InfiniteBloomStart = 0b1_00011_01, //start of an filtered infinite repetition x* (length)
142 InfiniteBloomEnd = 0b1_00011_10, //end of filtered infinite repetition x* (length,mergeIndex)
143 RepeatStart = 0b1_00100_01, //start of a {n,m} repetition (length)
144 RepeatEnd = 0b1_00100_10, //end of x{n,m} repetition (length,step,minRep,maxRep)
145 RepeatQStart = 0b1_00101_01, //start of a non eager x{n,m}? repetition (length)
146 RepeatQEnd = 0b1_00101_10, //end of non eager x{n,m}? repetition (length,step,minRep,maxRep)
147
148 //
149 LookaheadStart = 0b1_00110_01, //begin of the lookahead group (length)
150 LookaheadEnd = 0b1_00110_10, //end of a lookahead group (length)
151 NeglookaheadStart = 0b1_00111_01, //start of a negative lookahead (length)
152 NeglookaheadEnd = 0b1_00111_10, //end of a negative lookahead (length)
153 LookbehindStart = 0b1_01000_01, //start of a lookbehind (length)
154 LookbehindEnd = 0b1_01000_10, //end of a lookbehind (length)
155 NeglookbehindStart = 0b1_01001_01, //start of a negative lookbehind (length)
156 NeglookbehindEnd = 0b1_01001_10, //end of negative lookbehind (length)
157 }
158
159 //a shorthand for IR length - full length of specific opcode evaluated at compile time
IRL(IR code)160 template IRL(IR code)
161 {
162 enum uint IRL = lengthOfIR(code);
163 }
164 static assert(IRL!(IR.LookaheadStart) == 3);
165
166 //how many parameters follow the IR, should be optimized fixing some IR bits
immediateParamsIR(IR i)167 int immediateParamsIR(IR i) @safe pure nothrow @nogc
168 {
169 switch (i)
170 {
171 case IR.OrEnd,IR.InfiniteEnd,IR.InfiniteQEnd:
172 return 1; // merge table index
173 case IR.InfiniteBloomEnd:
174 return 2; // bloom filter index + merge table index
175 case IR.RepeatEnd, IR.RepeatQEnd:
176 return 4;
177 case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
178 return 2; // start-end of captures used
179 default:
180 return 0;
181 }
182 }
183
184 //full length of IR instruction inlcuding all parameters that might follow it
lengthOfIR(IR i)185 int lengthOfIR(IR i) @safe pure nothrow @nogc
186 {
187 return 1 + immediateParamsIR(i);
188 }
189
190 //full length of the paired IR instruction inlcuding all parameters that might follow it
lengthOfPairedIR(IR i)191 int lengthOfPairedIR(IR i) @safe pure nothrow @nogc
192 {
193 return 1 + immediateParamsIR(pairedIR(i));
194 }
195
196 //if the operation has a merge point (this relies on the order of the ops)
hasMerge(IR i)197 bool hasMerge(IR i) @safe pure nothrow @nogc
198 {
199 return (i&0b11)==0b10 && i <= IR.RepeatQEnd;
200 }
201
202 //is an IR that opens a "group"
isStartIR(IR i)203 bool isStartIR(IR i) @safe pure nothrow @nogc
204 {
205 return (i&0b11)==0b01;
206 }
207
208 //is an IR that ends a "group"
isEndIR(IR i)209 bool isEndIR(IR i) @safe pure nothrow @nogc
210 {
211 return (i&0b11)==0b10;
212 }
213
214 //is a standalone IR
isAtomIR(IR i)215 bool isAtomIR(IR i) @safe pure nothrow @nogc
216 {
217 return (i&0b11)==0b00;
218 }
219
220 //makes respective pair out of IR i, swapping start/end bits of instruction
pairedIR(IR i)221 IR pairedIR(IR i) @safe pure nothrow @nogc
222 {
223 assert(isStartIR(i) || isEndIR(i));
224 return cast(IR) (i ^ 0b11);
225 }
226
227 //encoded IR instruction
228 @safe pure
229 struct Bytecode
230 {
231 uint raw;
232 //natural constraints
233 enum maxSequence = 2+4;
234 enum maxData = 1 << 22;
235 enum maxRaw = 1 << 31;
236
237 @safe pure:
thisBytecode238 this(IR code, uint data)
239 {
240 assert(data < (1 << 22) && code < 256);
241 raw = code << 24 | data;
242 }
243
thisBytecode244 this(IR code, uint data, uint seq)
245 {
246 assert(data < (1 << 22) && code < 256 );
247 assert(seq >= 2 && seq < maxSequence);
248 raw = code << 24 | (seq - 2)<<22 | data;
249 }
250
251 //store raw data
fromRawBytecode252 static Bytecode fromRaw(uint data)
253 {
254 Bytecode t;
255 t.raw = data;
256 return t;
257 }
258
259 // bit twiddling helpers
260 // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985
dataBytecode261 @property uint data()() const { return raw & 0x003f_ffff; }
262
dataBytecode263 @property void data()(uint val)
264 {
265 raw = (raw & ~0x003f_ffff) | (val & 0x003f_ffff);
266 }
267
268 // ditto
269 // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985
sequenceBytecode270 @property uint sequence()() const { return 2 + (raw >> 22 & 0x3); }
271
272 // ditto
273 // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985
codeBytecode274 @property IR code()() const { return cast(IR)(raw >> 24); }
275
276 //ditto
hotspotBytecode277 @property bool hotspot() const { return hasMerge(code); }
278
279 //test the class of this instruction
isAtomBytecode280 @property bool isAtom() const { return isAtomIR(code); }
281
282 //ditto
isStartBytecode283 @property bool isStart() const { return isStartIR(code); }
284
285 //ditto
isEndBytecode286 @property bool isEnd() const { return isEndIR(code); }
287
288 //number of arguments for this instruction
argsBytecode289 @property int args() const { return immediateParamsIR(code); }
290
291 //mark this GroupStart or GroupEnd as referenced in backreference
setBackrefenceBytecode292 void setBackrefence()
293 {
294 assert(code == IR.GroupStart || code == IR.GroupEnd);
295 raw = raw | 1 << 23;
296 }
297
298 //is referenced
backreferenceBytecode299 @property bool backreference() const
300 {
301 assert(code == IR.GroupStart || code == IR.GroupEnd);
302 return cast(bool)(raw & 1 << 23);
303 }
304
305 //mark as local reference (for backrefs in lookarounds)
setLocalRefBytecode306 void setLocalRef()
307 {
308 assert(code == IR.Backref);
309 raw = raw | 1 << 23;
310 }
311
312 //is a local ref
localRefBytecode313 @property bool localRef() const
314 {
315 assert(code == IR.Backref);
316 return cast(bool)(raw & 1 << 23);
317 }
318
319 //human readable name of instruction
mnemonicBytecode320 @trusted @property string mnemonic()() const
321 {//@@@BUG@@@ to is @system
322 import std.conv : to;
323 return to!string(code);
324 }
325
326 //full length of instruction
lengthBytecode327 @property uint length() const
328 {
329 return lengthOfIR(code);
330 }
331
332 //full length of respective start/end of this instruction
pairedLengthBytecode333 @property uint pairedLength() const
334 {
335 return lengthOfPairedIR(code);
336 }
337
338 //returns bytecode of paired instruction (assuming this one is start or end)
pairedBytecode339 @property Bytecode paired() const
340 {//depends on bit and struct layout order
341 assert(isStart || isEnd);
342 return Bytecode.fromRaw(raw ^ 0b11 << 24);
343 }
344
345 //gets an index into IR block of the respective pair
indexOfPairBytecode346 uint indexOfPair(uint pc) const
347 {
348 assert(isStart || isEnd);
349 return isStart ? pc + data + length : pc - data - lengthOfPairedIR(code);
350 }
351 }
352
353 static assert(Bytecode.sizeof == 4);
354
355
356 //index entry structure for name --> number of submatch
357 struct NamedGroup
358 {
359 string name;
360 uint group;
361 }
362
363 //holds pair of start-end markers for a submatch
Group(DataIndex)364 struct Group(DataIndex)
365 {
366 DataIndex begin = DataIndex.max;
367 DataIndex end = DataIndex.min;
368
369 bool opCast(T : bool)() const
370 {
371 return begin <= end;
372 }
373
374 @trusted string toString()() const
375 {
376 if (begin < end)
377 return "(unmatched)";
378 import std.array : appender;
379 import std.format.write : formattedWrite;
380 auto a = appender!string();
381 formattedWrite(a, "%s..%s", begin, end);
382 return a.data;
383 }
384 }
385
386 //debugging tool, prints out instruction along with opcodes
387 @trusted string disassemble(in Bytecode[] irb, uint pc, in NamedGroup[] dict=[])
388 {
389 import std.array : appender;
390 import std.format.write : formattedWrite;
391 auto output = appender!string();
392 formattedWrite(output,"%s", irb[pc].mnemonic);
393 switch (irb[pc].code)
394 {
395 case IR.Char:
396 formattedWrite(output, " %s (0x%x)",cast(dchar) irb[pc].data, irb[pc].data);
397 break;
398 case IR.OrChar:
399 formattedWrite(output, " %s (0x%x) seq=%d", cast(dchar) irb[pc].data, irb[pc].data, irb[pc].sequence);
400 break;
401 case IR.RepeatStart, IR.InfiniteStart, IR.InfiniteBloomStart,
402 IR.Option, IR.GotoEndOr, IR.OrStart:
403 //forward-jump instructions
404 uint len = irb[pc].data;
405 formattedWrite(output, " pc=>%u", pc+len+IRL!(IR.RepeatStart));
406 break;
407 case IR.RepeatEnd, IR.RepeatQEnd: //backward-jump instructions
408 uint len = irb[pc].data;
409 formattedWrite(output, " pc=>%u min=%u max=%u step=%u",
410 pc - len, irb[pc + 3].raw, irb[pc + 4].raw, irb[pc + 2].raw);
411 break;
412 case IR.InfiniteEnd, IR.InfiniteQEnd, IR.InfiniteBloomEnd, IR.OrEnd: //ditto
413 uint len = irb[pc].data;
414 formattedWrite(output, " pc=>%u", pc-len);
415 break;
416 case IR.LookaheadEnd, IR.NeglookaheadEnd: //ditto
417 uint len = irb[pc].data;
418 formattedWrite(output, " pc=>%u", pc-len);
419 break;
420 case IR.GroupStart, IR.GroupEnd:
421 uint n = irb[pc].data;
422 string name;
423 foreach (v;dict)
424 if (v.group == n)
425 {
426 name = "'"~v.name~"'";
427 break;
428 }
429 formattedWrite(output, " %s #%u " ~ (irb[pc].backreference ? "referenced" : ""),
430 name, n);
431 break;
432 case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
433 uint len = irb[pc].data;
434 uint start = irb[pc+1].raw, end = irb[pc+2].raw;
435 formattedWrite(output, " pc=>%u [%u..%u]", pc + len + IRL!(IR.LookaheadStart), start, end);
436 break;
437 case IR.Backref: case IR.CodepointSet: case IR.Trie:
438 uint n = irb[pc].data;
439 formattedWrite(output, " %u", n);
440 if (irb[pc].code == IR.Backref)
441 formattedWrite(output, " %s", irb[pc].localRef ? "local" : "global");
442 break;
443 default://all data-free instructions
444 }
445 if (irb[pc].hotspot)
446 formattedWrite(output, " Hotspot %u", irb[pc+1].raw);
447 return output.data;
448 }
449
450 //disassemble the whole chunk
printBytecode()451 @trusted void printBytecode()(in Bytecode[] slice, in NamedGroup[] dict=[])
452 {
453 import std.stdio : writeln;
454 for (uint pc=0; pc<slice.length; pc += slice[pc].length)
455 writeln("\t", disassemble(slice, pc, dict));
456 }
457
458 // Encapsulates memory management, explicit ref counting
459 // and the exact type of engine created
460 // there is a single instance per engine combination type x Char
461 // In future may also maintain a (TLS?) cache of memory
MatcherFactory(Char)462 interface MatcherFactory(Char)
463 {
464 @safe:
465 Matcher!Char create(const ref Regex!Char, in Char[] input) const;
466 Matcher!Char dup(Matcher!Char m, in Char[] input) const;
467 size_t incRef(Matcher!Char m) const;
468 size_t decRef(Matcher!Char m) const;
469 }
470
471 // Only memory management, no compile-time vs run-time specialities
472 abstract class GenericFactory(alias EngineType, Char) : MatcherFactory!Char
473 {
474 import core.memory : pureFree;
475 import std.internal.memory : enforceMalloc;
476 import core.memory : GC;
477 // round up to next multiple of size_t for alignment purposes
478 enum classSize = (__traits(classInstanceSize, EngineType!Char) + size_t.sizeof - 1) & ~(size_t.sizeof - 1);
479
480 EngineType!Char construct(const ref Regex!Char re, in Char[] input, void[] memory) const;
481
482 override EngineType!Char create(const ref Regex!Char re, in Char[] input) const @trusted
483 {
484 immutable size = EngineType!Char.initialMemory(re) + classSize;
485 auto memory = enforceMalloc(size)[0 .. size];
486 scope(failure) pureFree(memory.ptr);
487 GC.addRange(memory.ptr, classSize);
488 auto engine = construct(re, input, memory);
489 assert(engine.refCount == 1);
490 assert(cast(void*) engine == memory.ptr);
491 return engine;
492 }
493
494 override EngineType!Char dup(Matcher!Char engine, in Char[] input) const @trusted
495 {
496 immutable size = EngineType!Char.initialMemory(engine.pattern) + classSize;
497 auto memory = enforceMalloc(size)[0 .. size];
498 scope(failure) pureFree(memory.ptr);
499 auto copy = construct(engine.pattern, input, memory);
500 GC.addRange(memory.ptr, classSize);
501 engine.dupTo(copy, memory[classSize .. size]);
502 assert(copy.refCount == 1);
503 return copy;
504 }
505
506 override size_t incRef(Matcher!Char m) const
507 {
508 return ++m.refCount;
509 }
510
511 override size_t decRef(Matcher!Char m) const @trusted
512 {
513 assert(m.refCount != 0);
514 auto cnt = --m.refCount;
515 if (cnt == 0)
516 {
517 void* ptr = cast(void*) m;
518 GC.removeRange(ptr);
519 pureFree(ptr);
520 }
521 return cnt;
522 }
523 }
524
525 // A factory for run-time engines
526 class RuntimeFactory(alias EngineType, Char) : GenericFactory!(EngineType, Char)
527 {
528 override EngineType!Char construct(const ref Regex!Char re, in Char[] input, void[] memory) const
529 {
530 import core.lifetime : emplace;
531 return emplace!(EngineType!Char)(memory[0 .. classSize],
532 re, Input!Char(input), memory[classSize .. $]);
533 }
534 }
535
536 // A factory for compile-time engine
537 class CtfeFactory(alias EngineType, Char, alias func) : GenericFactory!(EngineType, Char)
538 {
539 override EngineType!Char construct(const ref Regex!Char re, in Char[] input, void[] memory) const
540 {
541 import core.lifetime : emplace;
542 return emplace!(EngineType!Char)(memory[0 .. classSize],
543 re, &func, Input!Char(input), memory[classSize .. $]);
544 }
545 }
546
defaultFactoryImpl(Char)547 private auto defaultFactoryImpl(Char)(const ref Regex!Char re)
548 {
549 import std.regex.internal.backtracking : BacktrackingMatcher;
550 import std.regex.internal.thompson : ThompsonMatcher;
551 import std.algorithm.searching : canFind;
552 static MatcherFactory!Char backtrackingFactory;
553 static MatcherFactory!Char thompsonFactory;
554 if (re.backrefed.canFind!"a != 0")
555 {
556 if (backtrackingFactory is null)
557 backtrackingFactory = new RuntimeFactory!(BacktrackingMatcher, Char);
558 return backtrackingFactory;
559 }
560 else
561 {
562 if (thompsonFactory is null)
563 thompsonFactory = new RuntimeFactory!(ThompsonMatcher, Char);
564 return thompsonFactory;
565 }
566 }
567
568 // Used to generate a pure wrapper for defaultFactoryImpl. Based on the example in
569 // the std.traits.SetFunctionAttributes documentation.
570 auto assumePureFunction(T)(T t)
571 if (isFunctionPointer!T)
572 {
573 enum attrs = functionAttributes!T | FunctionAttribute.pure_;
574 return cast(SetFunctionAttributes!(T, functionLinkage!T, attrs)) t;
575 }
576
577 // A workaround for R-T enum re = regex(...)
defaultFactory(Char)578 template defaultFactory(Char)
579 {
580 // defaultFactory is constructed as a safe, pure wrapper over defaultFactoryImpl.
581 // It can be faked as pure because the static mutable variables are used to cache
582 // the key and character matcher. The technique used avoids delegates and GC.
583 @property MatcherFactory!Char defaultFactory(const ref Regex!Char re) @safe pure
584 {
585 static auto impl(const ref Regex!Char re)
586 {
587 return defaultFactoryImpl(re);
588 }
589
590 static auto pureImpl(const ref Regex!Char re) @trusted
591 {
592 auto p = assumePureFunction(&impl);
593 return p(re);
594 }
595
596 return pureImpl(re);
597 }
598 }
599
600 // Defining it as an interface has the undesired side-effect:
601 // casting any class to an interface silently adjusts pointer to point to a nested vtbl
Matcher(Char)602 abstract class Matcher(Char)
603 {
604 abstract:
605 // Get a (next) match
606 int match(Group!size_t[] matches) pure;
607 // This only maintains internal ref-count,
608 // deallocation happens inside MatcherFactory
609 @property ref size_t refCount() @safe;
610 // Copy internal state to another engine, using memory arena 'memory'
611 void dupTo(Matcher!Char m, void[] memory);
612 // The pattern loaded
613 @property ref const(Regex!Char) pattern() @safe;
614 // Re-arm the engine with new Input
615 Matcher rearm(in Char[] stream);
616 }
617
618 /++
619 `Regex` object holds regular expression pattern in compiled form.
620 Instances of this object are constructed via calls to `regex`.
621 This is an intended form for caching and storage of frequently
622 used regular expressions.
623 +/
Regex(Char)624 struct Regex(Char)
625 {
626 //temporary workaround for identifier lookup
627 CodepointSet[] charsets; //
628 Bytecode[] ir; //compiled bytecode of pattern
629
630
631 @safe @property bool empty() const nothrow { return ir is null; }
632 /++
633 `namedCaptures` returns a range of all named captures in a given regular expression.
634 +/
635 @safe @property auto namedCaptures()
636 {
637 static struct NamedGroupRange
638 {
639 private:
640 const(NamedGroup)[] groups;
641 size_t start;
642 size_t end;
643 public:
644 this(const(NamedGroup)[] g, size_t s, size_t e)
645 {
646 assert(s <= e);
647 assert(e <= g.length);
648 groups = g;
649 start = s;
650 end = e;
651 }
652
653 @property string front() { return groups[start].name; }
654 @property string back() { return groups[end-1].name; }
655 @property bool empty() { return start >= end; }
656 @property size_t length() { return end - start; }
657 alias opDollar = length;
658 @property NamedGroupRange save()
659 {
660 return NamedGroupRange(groups, start, end);
661 }
662 void popFront() { assert(!empty); start++; }
663 void popBack() { assert(!empty); end--; }
664 string opIndex()(size_t i)
665 {
666 assert(start + i < end,
667 "Requested named group is out of range.");
668 return groups[start+i].name;
669 }
670 NamedGroupRange opSlice(size_t low, size_t high) {
671 assert(low <= high);
672 assert(start + high <= end);
673 return NamedGroupRange(groups, start + low, start + high);
674 }
675 NamedGroupRange opSlice() { return this.save; }
676 }
677 return NamedGroupRange(dict, 0, dict.length);
678 }
679
680 package(std.regex):
681 import std.regex.internal.kickstart : Kickstart; //TODO: get rid of this dependency
682 const(NamedGroup)[] dict; // maps name -> user group number
683 uint ngroup; // number of internal groups
684 uint maxCounterDepth; // max depth of nested {n,m} repetitions
685 uint hotspotTableSize; // number of entries in merge table
686 uint threadCount; // upper bound on number of Thompson VM threads
687 uint flags; // global regex flags
688 public const(CharMatcher)[] matchers; // tables that represent character sets
689 public const(BitTable)[] filters; // bloom filters for conditional loops
690 uint[] backrefed; // bit array of backreferenced submatches
691 Kickstart!Char kickstart;
692 MatcherFactory!Char factory; // produces optimal matcher for this pattern
693 immutable(Char)[] pattern; // copy of pattern to serve as cache key
694
695 const(Regex) withFactory(MatcherFactory!Char factory) pure const @trusted
696 {
697 auto r = cast() this;
698 r.factory = factory;
699 return r;
700 }
701
702 const(Regex) withFlags(uint newFlags) pure const @trusted
703 {
704 auto r = cast() this;
705 r.flags = newFlags;
706 return r;
707 }
708
709 const(Regex) withCode(const(Bytecode)[] code) pure const @trusted
710 {
711 auto r = cast() this;
712 r.ir = code.dup; // TODO: sidestep const instead?
713 return r;
714 }
715
716 const(Regex) withNGroup(uint nGroup) pure const @trusted
717 {
718 auto r = cast() this;
719 r.ngroup = nGroup;
720 return r;
721 }
722
723 //bit access helper
724 uint isBackref(uint n)
725 {
726 if (n/32 >= backrefed.length)
727 return 0;
728 return backrefed[n / 32] & (1 << (n & 31));
729 }
730
731 //check if searching is not needed
732 void checkIfOneShot()
733 {
734 L_CheckLoop:
735 for (uint i = 0; i < ir.length; i += ir[i].length)
736 {
737 switch (ir[i].code)
738 {
739 case IR.Bof:
740 flags |= RegexInfo.oneShot;
741 break L_CheckLoop;
742 case IR.GroupStart, IR.GroupEnd, IR.Bol, IR.Eol, IR.Eof,
743 IR.Wordboundary, IR.Notwordboundary:
744 break;
745 default:
746 break L_CheckLoop;
747 }
748 }
749 }
750
751 //print out disassembly a program's IR
752 @trusted debug(std_regex_parser) void print() const
753 {//@@@BUG@@@ write is system
754 for (uint i = 0; i < ir.length; i += ir[i].length)
755 {
756 writefln("%d\t%s ", i, disassemble(ir, i, dict));
757 }
758 writeln("Total merge table size: ", hotspotTableSize);
759 writeln("Max counter nesting depth: ", maxCounterDepth);
760 }
761
762 public string toString()() const
763 {
764 import std.format : format;
765 static if (is(typeof(pattern) : string))
766 alias patternString = pattern;
767 else
768 {
769 import std.conv : to;
770 auto patternString = conv.to!string(pattern);
771 }
772 auto quotedEscapedPattern = format("%(%s %)", [patternString]);
773 auto flagString = regexOptionsToString(flags);
774 return "Regex!" ~ Char.stringof ~ "(" ~ quotedEscapedPattern ~ ", \"" ~ flagString ~ "\")";
775 }
776 }
777
778 // The stuff below this point is temporarrily part of IR module
779 // but may need better place in the future (all internals)
780 package(std.regex):
781
782 //Simple UTF-string abstraction compatible with stream interface
783 struct Input(Char)
784 if (is(Char :dchar))
785 {
786 import std.utf : decode;
787 alias DataIndex = size_t;
788 enum bool isLoopback = false;
789 alias String = const(Char)[];
790 String _origin;
791 size_t _index;
792
793 //constructs Input object out of plain string
794 this(String input, size_t idx = 0)
795 {
796 _origin = input;
797 _index = idx;
798 }
799
800 //codepoint at current stream position
pragma(inline,true)801 pragma(inline, true) bool nextChar(ref dchar res, ref size_t pos)
802 {
803 pos = _index;
804 // DMD's inliner hates multiple return functions
805 // but can live with single statement if/else bodies
806 bool n = !(_index == _origin.length);
807 if (n)
808 res = decode(_origin, _index);
809 return n;
810 }
atEnd()811 @property bool atEnd(){
812 return _index == _origin.length;
813 }
search(Kickstart)814 bool search(Kickstart)(ref const Kickstart kick, ref dchar res, ref size_t pos)
815 {
816 size_t idx = kick.search(_origin, _index);
817 _index = idx;
818 return nextChar(res, pos);
819 }
820
821 //index of at End position
lastIndex()822 @property size_t lastIndex(){ return _origin.length; }
823
824 //support for backtracker engine, might not be present
reset(size_t index)825 void reset(size_t index){ _index = index; }
826
opSlice(size_t start,size_t end)827 String opSlice(size_t start, size_t end){ return _origin[start .. end]; }
828
loopBack(size_t index)829 auto loopBack(size_t index){ return BackLooper!Input(this, index); }
830 }
831
BackLooperImpl(Input)832 struct BackLooperImpl(Input)
833 {
834 import std.utf : strideBack;
835 alias DataIndex = size_t;
836 alias String = Input.String;
837 enum bool isLoopback = true;
838 String _origin;
839 size_t _index;
840 this(Input input, size_t index)
841 {
842 _origin = input._origin;
843 _index = index;
844 }
845 this(String input)
846 {
847 _origin = input;
848 _index = input.length;
849 }
850 @trusted bool nextChar(ref dchar res,ref size_t pos)
851 {
852 pos = _index;
853 if (_index == 0)
854 return false;
855
856 res = _origin[0.._index].back;
857 _index -= strideBack(_origin, _index);
858
859 return true;
860 }
861 @property atEnd(){ return _index == 0 || _index == strideBack(_origin, _index); }
862 auto loopBack(size_t index){ return Input(_origin, index); }
863
864 //support for backtracker engine, might not be present
865 //void reset(size_t index){ _index = index ? index-std.utf.strideBack(_origin, index) : 0; }
866 void reset(size_t index){ _index = index; }
867
868 String opSlice(size_t start, size_t end){ return _origin[end .. start]; }
869 //index of at End position
870 @property size_t lastIndex(){ return 0; }
871 }
872
BackLooper(E)873 template BackLooper(E)
874 {
875 static if (is(E : BackLooperImpl!U, U))
876 {
877 alias BackLooper = U;
878 }
879 else
880 {
881 alias BackLooper = BackLooperImpl!E;
882 }
883 }
884
885 //both helpers below are internal, on its own are quite "explosive"
886 //unsafe, no initialization of elements
mallocArray(T)887 @system pure T[] mallocArray(T)(size_t len)
888 {
889 import core.memory : pureMalloc;
890 return (cast(T*) pureMalloc(len * T.sizeof))[0 .. len];
891 }
892
893 //very unsafe, no initialization
arrayInChunk(T)894 @system T[] arrayInChunk(T)(size_t len, ref void[] chunk)
895 {
896 auto ret = (cast(T*) chunk.ptr)[0 .. len];
897 chunk = chunk[len * T.sizeof .. $];
898 return ret;
899 }
900
901 //
lookupNamedGroup(String)902 @trusted uint lookupNamedGroup(String)(const(NamedGroup)[] dict, String name)
903 {//equal is @system?
904 import std.algorithm.comparison : equal;
905 import std.algorithm.iteration : map;
906 import std.conv : text;
907 import std.range : assumeSorted;
908
909 auto fnd = assumeSorted!"cmp(a,b) < 0"(map!"a.name"(dict)).lowerBound(name).length;
910 enforce(fnd < dict.length && equal(dict[fnd].name, name),
911 text("no submatch named ", name));
912 return dict[fnd].group;
913 }
914
915 // whether ch is one of unicode newline sequences
916 // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985
endOfLine()917 bool endOfLine()(dchar front, bool seenCr)
918 {
919 return ((front == '\n') ^ seenCr) || front == '\r'
920 || front == NEL || front == LS || front == PS;
921 }
922
923 // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985
startOfLine()924 bool startOfLine()(dchar back, bool seenNl)
925 {
926 return ((back == '\r') ^ seenNl) || back == '\n'
927 || back == NEL || back == LS || back == PS;
928 }
929
930 ///Exception object thrown in case of errors during regex compilation.
931 public class RegexException : Exception
932 {
933 mixin basicExceptionCtors;
934 }
935
936 // simple 128-entry bit-table used with a hash function
937 struct BitTable {
938 uint[4] filter;
939
thisBitTable940 this(CodepointSet set){
941 foreach (iv; set.byInterval)
942 {
943 foreach (v; iv.a .. iv.b)
944 add(v);
945 }
946 }
947
addBitTable948 void add()(dchar ch){
949 immutable i = index(ch);
950 filter[i >> 5] |= 1<<(i & 31);
951 }
952 // non-zero -> might be present, 0 -> absent
opIndexBitTable953 bool opIndex()(dchar ch) const{
954 immutable i = index(ch);
955 return (filter[i >> 5]>>(i & 31)) & 1;
956 }
957
indexBitTable958 static uint index()(dchar ch){
959 return ((ch >> 7) ^ ch) & 0x7F;
960 }
961 }
962
963 struct CharMatcher {
964 BitTable ascii; // fast path for ASCII
965 Trie trie; // slow path for Unicode
966
this(CodepointSet set)967 this(CodepointSet set)
968 {
969 auto asciiSet = set & unicode.ASCII;
970 ascii = BitTable(asciiSet);
971 trie = makeTrie(set);
972 }
973
opIndex()974 bool opIndex()(dchar ch) const
975 {
976 if (ch < 0x80)
977 return ascii[ch];
978 else
979 return trie[ch];
980 }
981 }
982
983 // Internal non-resizeble array, switches between inline storage and CoW
984 // POD-only
985 struct SmallFixedArray(T, uint SMALL=3)
986 if (!hasElaborateDestructor!T)
987 {
988 import std.internal.memory : enforceMalloc;
989 import core.memory : pureFree;
990 static struct Payload
991 {
992 size_t refcount;
993 T[0] placeholder;
inoutSmallFixedArray::Payload994 inout(T)* ptr() inout { return placeholder.ptr; }
995 }
996 static assert(Payload.sizeof == size_t.sizeof);
997 union
998 {
999 Payload* big;
1000 T[SMALL] small;
1001 }
1002 size_t _sizeMask;
1003 enum BIG_MASK = size_t(1)<<(8*size_t.sizeof-1);
1004 enum SIZE_MASK = ~BIG_MASK;
1005
isBigSmallFixedArray1006 @property bool isBig() const { return (_sizeMask & BIG_MASK) != 0; }
lengthSmallFixedArray1007 @property size_t length() const { return _sizeMask & SIZE_MASK; }
1008
thisSmallFixedArray1009 this(size_t size)
1010 {
1011 if (size <= SMALL)
1012 {
1013 small[] = T.init;
1014 _sizeMask = size;
1015 }
1016 else
1017 {
1018 big = cast(Payload*) enforceMalloc(Payload.sizeof + T.sizeof*size);
1019 big.refcount = 1;
1020 _sizeMask = size | BIG_MASK;
1021 }
1022 }
1023
inoutSmallFixedArray1024 private @trusted @property inout(T)[] internalSlice() inout
1025 {
1026 return isBig ? big.ptr[0 .. length] : small[0 .. length];
1027 }
1028
thisSmallFixedArray1029 this(this)
1030 {
1031 if (isBig)
1032 {
1033 big.refcount++;
1034 }
1035 }
1036
opEqualsSmallFixedArray1037 bool opEquals(SmallFixedArray a)
1038 {
1039 return internalSlice[] == a.internalSlice[];
1040 }
1041
toHashSmallFixedArray1042 size_t toHash() const
1043 {
1044 return hashOf(internalSlice[]);
1045 }
1046
inoutSmallFixedArray1047 ref inout(T) opIndex(size_t idx) inout
1048 {
1049 return internalSlice[idx];
1050 }
1051
1052 // accesses big to test self-referencing so not @safe
opAssignSmallFixedArray1053 @trusted ref opAssign(SmallFixedArray arr)
1054 {
1055 if (isBig)
1056 {
1057 if (arr.isBig)
1058 {
1059 if (big is arr.big) return this; // self-assign
1060 else
1061 {
1062 abandonRef();
1063 _sizeMask = arr._sizeMask;
1064 big = arr.big;
1065 big.refcount++;
1066 }
1067 }
1068 else
1069 {
1070 abandonRef();
1071 _sizeMask = arr._sizeMask;
1072 small = arr.small;
1073 }
1074 }
1075 else
1076 {
1077 if (arr.isBig)
1078 {
1079 _sizeMask = arr._sizeMask;
1080 big = arr.big;
1081 big.refcount++;
1082 }
1083 else
1084 {
1085 _sizeMask = arr._sizeMask;
1086 small = arr.small;
1087 }
1088 }
1089 return this;
1090 }
1091
mutateSmallFixedArray1092 void mutate(scope void delegate(T[]) pure filler)
1093 {
1094 if (isBig && big.refcount != 1) // copy on write
1095 {
1096 auto oldSizeMask = _sizeMask;
1097 auto newbig = cast(Payload*) enforceMalloc(Payload.sizeof + T.sizeof*length);
1098 newbig.refcount = 1;
1099 abandonRef();
1100 big = newbig;
1101 _sizeMask = oldSizeMask;
1102 }
1103 filler(internalSlice);
1104 }
1105
~thisSmallFixedArray1106 ~this()
1107 {
1108 if (isBig)
1109 {
1110 abandonRef();
1111 }
1112 }
1113
abandonRefSmallFixedArray1114 @trusted private void abandonRef()
1115 {
1116 assert(isBig);
1117 if (--big.refcount == 0)
1118 {
1119 pureFree(big);
1120 _sizeMask = 0;
1121 assert(!isBig);
1122 }
1123 }
1124 }
1125
1126 @system unittest
1127 {
1128 alias SA = SmallFixedArray!(int, 2);
create(int[]data)1129 SA create(int[] data)
1130 {
1131 SA a = SA(data.length);
1132 a.mutate((slice) { slice[] = data[]; });
1133 assert(a.internalSlice == data);
1134 return a;
1135 }
1136
1137 {
1138 SA a;
1139 a = SA(1);
1140 assert(a.length == 1);
1141 a = SA.init;
1142 assert(a.length == 0);
1143 }
1144
1145 {
1146 SA a, b, c, d;
1147 assert(a.length == 0);
1148 assert(a.internalSlice == b.internalSlice);
1149 a = create([1]);
1150 assert(a.internalSlice == [1]);
1151 b = create([2, 3]);
1152 assert(b.internalSlice == [2, 3]);
1153 c = create([3, 4, 5]);
1154 d = create([5, 6, 7, 8]);
1155 assert(c.isBig);
1156 a = c;
1157 assert(a.isBig);
1158 assert(a.big is c.big);
1159 assert(a.big.refcount == 2);
1160 assert(a.internalSlice == [3, 4, 5]);
1161 assert(c.internalSlice == [3, 4, 5]);
1162 a = b;
1163 assert(!a.isBig);
1164 assert(a.internalSlice == [2, 3]);
1165 assert(c.big.refcount == 1);
1166 a = c;
1167 assert(c.big.refcount == 2);
1168
1169 // mutate copies on write if ref-count is not 1
1170 a.mutate((slice){ slice[] = 1; });
1171 assert(a.internalSlice == [1, 1, 1]);
1172 assert(c.internalSlice == [3, 4, 5]);
1173 assert(a.isBig && c.isBig);
1174 assert(a.big.refcount == 1);
1175 assert(c.big.refcount == 1);
1176
1177 auto e = d;
1178 assert(e.big.refcount == 2);
1179 auto f = d;
1180 f = a;
1181 assert(f.isBig);
1182 assert(f.internalSlice == [1, 1, 1]);
1183 assert(f.big.refcount == 2); // a & f
1184 assert(e.big.refcount == 2); // d & e
1185 a = c;
1186 assert(f.big.refcount == 1); // f
1187 assert(e.big.refcount == 2); // d & e
1188 a = a;
1189 a = a;
1190 a = a;
1191 assert(a.big.refcount == 2); // a & c
1192 }
1193 }
1194