1 /* 2 Implementation of std.regex IR, an intermediate representation 3 of a regular expression pattern. 4 5 This is a common ground between frontend regex component (parser) 6 and backend components - generators, matchers and other "filters". 7 */ 8 module std.regex.internal.ir; 9 10 package(std.regex): 11 12 import std.exception, std.meta, std.range.primitives, std.traits, std.uni; 13 14 debug(std_regex_parser) import std.stdio; 15 // just a common trait, may be moved elsewhere 16 alias BasicElementOf(Range) = Unqual!(ElementEncodingType!Range); 17 18 enum privateUseStart = '\U000F0000', privateUseEnd ='\U000FFFFD'; 19 20 // heuristic value determines maximum CodepointSet length suitable for linear search 21 enum maxCharsetUsed = 6; 22 23 // another variable to tweak behavior of caching generated Tries for character classes 24 enum maxCachedMatchers = 8; 25 26 alias Trie = CodepointSetTrie!(13, 8); 27 alias makeTrie = codepointSetTrie!(13, 8); 28 29 CharMatcher[CodepointSet] matcherCache; 30 31 //accessor with caching 32 @trusted CharMatcher getMatcher(CodepointSet set) 33 { 34 // almost all properties of AA are not @safe 35 // https://issues.dlang.org/show_bug.cgi?id=6357 36 if (__ctfe || maxCachedMatchers == 0) 37 return CharMatcher(set); 38 else 39 { 40 auto p = set in matcherCache; 41 if (p) 42 return *p; 43 if (matcherCache.length == maxCachedMatchers) 44 { 45 // flush enmatchers in trieCache 46 matcherCache = null; 47 } 48 return (matcherCache[set] = CharMatcher(set)); 49 } 50 } 51 52 @property ref wordMatcher()() 53 { 54 static immutable CharMatcher matcher = CharMatcher(wordCharacter); 55 return matcher; 56 } 57 58 // some special Unicode white space characters 59 private enum NEL = '\u0085', LS = '\u2028', PS = '\u2029'; 60 61 //Regular expression engine/parser options: 62 // global - search all nonoverlapping matches in input 63 // casefold - case insensitive matching, do casefolding on match in unicode mode 64 // freeform - ignore whitespace in pattern, to match space use [ ] or \s 65 // multiline - switch ^, $ detect start and end of linesinstead of just start and end of input 66 enum RegexOption: uint { 67 global = 0x1, 68 casefold = 0x2, 69 freeform = 0x4, 70 nonunicode = 0x8, 71 multiline = 0x10, 72 singleline = 0x20 73 } 74 //do not reorder this list 75 alias RegexOptionNames = AliasSeq!('g', 'i', 'x', 'U', 'm', 's'); 76 static assert( RegexOption.max < 0x80); 77 78 package(std) string regexOptionsToString()(uint flags) nothrow pure @safe 79 { 80 flags &= (RegexOption.max << 1) - 1; 81 if (!flags) 82 return ""; 83 char[RegexOptionNames.length] buffer = void; 84 size_t pos = 0; 85 foreach (i, flag; __traits(allMembers, RegexOption)) 86 if (flags & __traits(getMember, RegexOption, flag)) 87 buffer[pos++] = RegexOptionNames[i]; 88 return buffer[0 .. pos].idup; 89 } 90 91 // flags that allow guide execution of engine 92 enum RegexInfo : uint { oneShot = 0x80 } 93 94 // IR bit pattern: 0b1_xxxxx_yy 95 // where yy indicates class of instruction, xxxxx for actual operation code 96 // 00: atom, a normal instruction 97 // 01: open, opening of a group, has length of contained IR in the low bits 98 // 10: close, closing of a group, has length of contained IR in the low bits 99 // 11 unused 100 // 101 // Loops with Q (non-greedy, with ? mark) must have the same size / other properties as non Q version 102 // Possible changes: 103 //* merge group, option, infinite/repeat start (to never copy during parsing of (a|b){1,2}) 104 //* reorganize groups to make n args easier to find, or simplify the check for groups of similar ops 105 // (like lookaround), or make it easier to identify hotspots. 106 107 enum IR:uint { 108 Char = 0b1_00000_00, //a character 109 Any = 0b1_00001_00, //any character 110 CodepointSet = 0b1_00010_00, //a most generic CodepointSet [...] 111 Trie = 0b1_00011_00, //CodepointSet implemented as Trie 112 //match with any of a consecutive OrChar's in this sequence 113 //(used for case insensitive match) 114 //OrChar holds in upper two bits of data total number of OrChars in this _sequence_ 115 //the drawback of this representation is that it is difficult 116 // to detect a jump in the middle of it 117 OrChar = 0b1_00100_00, 118 Nop = 0b1_00101_00, //no operation (padding) 119 End = 0b1_00110_00, //end of program 120 Bol = 0b1_00111_00, //beginning of a line ^ 121 Eol = 0b1_01000_00, //end of a line $ 122 Wordboundary = 0b1_01001_00, //boundary of a word 123 Notwordboundary = 0b1_01010_00, //not a word boundary 124 Backref = 0b1_01011_00, //backreference to a group (that has to be pinned, i.e. locally unique) (group index) 125 GroupStart = 0b1_01100_00, //start of a group (x) (groupIndex+groupPinning(1bit)) 126 GroupEnd = 0b1_01101_00, //end of a group (x) (groupIndex+groupPinning(1bit)) 127 Option = 0b1_01110_00, //start of an option within an alternation x | y (length) 128 GotoEndOr = 0b1_01111_00, //end of an option (length of the rest) 129 Bof = 0b1_10000_00, //begining of "file" (string) ^ 130 Eof = 0b1_10001_00, //end of "file" (string) $ 131 //... any additional atoms here 132 133 OrStart = 0b1_00000_01, //start of alternation group (length) 134 OrEnd = 0b1_00000_10, //end of the or group (length,mergeIndex) 135 //with this instruction order 136 //bit mask 0b1_00001_00 could be used to test/set greediness 137 InfiniteStart = 0b1_00001_01, //start of an infinite repetition x* (length) 138 InfiniteEnd = 0b1_00001_10, //end of infinite repetition x* (length,mergeIndex) 139 InfiniteQStart = 0b1_00010_01, //start of a non eager infinite repetition x*? (length) 140 InfiniteQEnd = 0b1_00010_10, //end of non eager infinite repetition x*? (length,mergeIndex) 141 InfiniteBloomStart = 0b1_00011_01, //start of an filtered infinite repetition x* (length) 142 InfiniteBloomEnd = 0b1_00011_10, //end of filtered infinite repetition x* (length,mergeIndex) 143 RepeatStart = 0b1_00100_01, //start of a {n,m} repetition (length) 144 RepeatEnd = 0b1_00100_10, //end of x{n,m} repetition (length,step,minRep,maxRep) 145 RepeatQStart = 0b1_00101_01, //start of a non eager x{n,m}? repetition (length) 146 RepeatQEnd = 0b1_00101_10, //end of non eager x{n,m}? repetition (length,step,minRep,maxRep) 147 148 // 149 LookaheadStart = 0b1_00110_01, //begin of the lookahead group (length) 150 LookaheadEnd = 0b1_00110_10, //end of a lookahead group (length) 151 NeglookaheadStart = 0b1_00111_01, //start of a negative lookahead (length) 152 NeglookaheadEnd = 0b1_00111_10, //end of a negative lookahead (length) 153 LookbehindStart = 0b1_01000_01, //start of a lookbehind (length) 154 LookbehindEnd = 0b1_01000_10, //end of a lookbehind (length) 155 NeglookbehindStart = 0b1_01001_01, //start of a negative lookbehind (length) 156 NeglookbehindEnd = 0b1_01001_10, //end of negative lookbehind (length) 157 } 158 159 //a shorthand for IR length - full length of specific opcode evaluated at compile time 160 template IRL(IR code) 161 { 162 enum uint IRL = lengthOfIR(code); 163 } 164 static assert(IRL!(IR.LookaheadStart) == 3); 165 166 //how many parameters follow the IR, should be optimized fixing some IR bits 167 int immediateParamsIR(IR i) @safe pure nothrow @nogc 168 { 169 switch (i) 170 { 171 case IR.OrEnd,IR.InfiniteEnd,IR.InfiniteQEnd: 172 return 1; // merge table index 173 case IR.InfiniteBloomEnd: 174 return 2; // bloom filter index + merge table index 175 case IR.RepeatEnd, IR.RepeatQEnd: 176 return 4; 177 case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart: 178 return 2; // start-end of captures used 179 default: 180 return 0; 181 } 182 } 183 184 //full length of IR instruction inlcuding all parameters that might follow it 185 int lengthOfIR(IR i) @safe pure nothrow @nogc 186 { 187 return 1 + immediateParamsIR(i); 188 } 189 190 //full length of the paired IR instruction inlcuding all parameters that might follow it 191 int lengthOfPairedIR(IR i) @safe pure nothrow @nogc 192 { 193 return 1 + immediateParamsIR(pairedIR(i)); 194 } 195 196 //if the operation has a merge point (this relies on the order of the ops) 197 bool hasMerge(IR i) @safe pure nothrow @nogc 198 { 199 return (i&0b11)==0b10 && i <= IR.RepeatQEnd; 200 } 201 202 //is an IR that opens a "group" 203 bool isStartIR(IR i) @safe pure nothrow @nogc 204 { 205 return (i&0b11)==0b01; 206 } 207 208 //is an IR that ends a "group" 209 bool isEndIR(IR i) @safe pure nothrow @nogc 210 { 211 return (i&0b11)==0b10; 212 } 213 214 //is a standalone IR 215 bool isAtomIR(IR i) @safe pure nothrow @nogc 216 { 217 return (i&0b11)==0b00; 218 } 219 220 //makes respective pair out of IR i, swapping start/end bits of instruction 221 IR pairedIR(IR i) @safe pure nothrow @nogc 222 { 223 assert(isStartIR(i) || isEndIR(i)); 224 return cast(IR) (i ^ 0b11); 225 } 226 227 //encoded IR instruction 228 @safe pure 229 struct Bytecode 230 { 231 uint raw; 232 //natural constraints 233 enum maxSequence = 2+4; 234 enum maxData = 1 << 22; 235 enum maxRaw = 1 << 31; 236 237 @safe pure: 238 this(IR code, uint data) 239 { 240 assert(data < (1 << 22) && code < 256); 241 raw = code << 24 | data; 242 } 243 244 this(IR code, uint data, uint seq) 245 { 246 assert(data < (1 << 22) && code < 256 ); 247 assert(seq >= 2 && seq < maxSequence); 248 raw = code << 24 | (seq - 2)<<22 | data; 249 } 250 251 //store raw data 252 static Bytecode fromRaw(uint data) 253 { 254 Bytecode t; 255 t.raw = data; 256 return t; 257 } 258 259 // bit twiddling helpers 260 // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985 261 @property uint data()() const { return raw & 0x003f_ffff; } 262 263 @property void data()(uint val) 264 { 265 raw = (raw & ~0x003f_ffff) | (val & 0x003f_ffff); 266 } 267 268 // ditto 269 // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985 270 @property uint sequence()() const { return 2 + (raw >> 22 & 0x3); } 271 272 // ditto 273 // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985 274 @property IR code()() const { return cast(IR)(raw >> 24); } 275 276 //ditto 277 @property bool hotspot() const { return hasMerge(code); } 278 279 //test the class of this instruction 280 @property bool isAtom() const { return isAtomIR(code); } 281 282 //ditto 283 @property bool isStart() const { return isStartIR(code); } 284 285 //ditto 286 @property bool isEnd() const { return isEndIR(code); } 287 288 //number of arguments for this instruction 289 @property int args() const { return immediateParamsIR(code); } 290 291 //mark this GroupStart or GroupEnd as referenced in backreference 292 void setBackrefence() 293 { 294 assert(code == IR.GroupStart || code == IR.GroupEnd); 295 raw = raw | 1 << 23; 296 } 297 298 //is referenced 299 @property bool backreference() const 300 { 301 assert(code == IR.GroupStart || code == IR.GroupEnd); 302 return cast(bool)(raw & 1 << 23); 303 } 304 305 //mark as local reference (for backrefs in lookarounds) 306 void setLocalRef() 307 { 308 assert(code == IR.Backref); 309 raw = raw | 1 << 23; 310 } 311 312 //is a local ref 313 @property bool localRef() const 314 { 315 assert(code == IR.Backref); 316 return cast(bool)(raw & 1 << 23); 317 } 318 319 //human readable name of instruction 320 @trusted @property string mnemonic()() const 321 {//@@@BUG@@@ to is @system 322 import std.conv : to; 323 return to!string(code); 324 } 325 326 //full length of instruction 327 @property uint length() const 328 { 329 return lengthOfIR(code); 330 } 331 332 //full length of respective start/end of this instruction 333 @property uint pairedLength() const 334 { 335 return lengthOfPairedIR(code); 336 } 337 338 //returns bytecode of paired instruction (assuming this one is start or end) 339 @property Bytecode paired() const 340 {//depends on bit and struct layout order 341 assert(isStart || isEnd); 342 return Bytecode.fromRaw(raw ^ 0b11 << 24); 343 } 344 345 //gets an index into IR block of the respective pair 346 uint indexOfPair(uint pc) const 347 { 348 assert(isStart || isEnd); 349 return isStart ? pc + data + length : pc - data - lengthOfPairedIR(code); 350 } 351 } 352 353 static assert(Bytecode.sizeof == 4); 354 355 356 //index entry structure for name --> number of submatch 357 struct NamedGroup 358 { 359 string name; 360 uint group; 361 } 362 363 //holds pair of start-end markers for a submatch 364 struct Group(DataIndex) 365 { 366 DataIndex begin = DataIndex.max; 367 DataIndex end = DataIndex.min; 368 369 bool opCast(T : bool)() const 370 { 371 return begin <= end; 372 } 373 374 @trusted string toString()() const 375 { 376 if (begin < end) 377 return "(unmatched)"; 378 import std.array : appender; 379 import std.format.write : formattedWrite; 380 auto a = appender!string(); 381 formattedWrite(a, "%s..%s", begin, end); 382 return a.data; 383 } 384 } 385 386 //debugging tool, prints out instruction along with opcodes 387 @trusted string disassemble(in Bytecode[] irb, uint pc, in NamedGroup[] dict=[]) 388 { 389 import std.array : appender; 390 import std.format.write : formattedWrite; 391 auto output = appender!string(); 392 formattedWrite(output,"%s", irb[pc].mnemonic); 393 switch (irb[pc].code) 394 { 395 case IR.Char: 396 formattedWrite(output, " %s (0x%x)",cast(dchar) irb[pc].data, irb[pc].data); 397 break; 398 case IR.OrChar: 399 formattedWrite(output, " %s (0x%x) seq=%d", cast(dchar) irb[pc].data, irb[pc].data, irb[pc].sequence); 400 break; 401 case IR.RepeatStart, IR.InfiniteStart, IR.InfiniteBloomStart, 402 IR.Option, IR.GotoEndOr, IR.OrStart: 403 //forward-jump instructions 404 uint len = irb[pc].data; 405 formattedWrite(output, " pc=>%u", pc+len+IRL!(IR.RepeatStart)); 406 break; 407 case IR.RepeatEnd, IR.RepeatQEnd: //backward-jump instructions 408 uint len = irb[pc].data; 409 formattedWrite(output, " pc=>%u min=%u max=%u step=%u", 410 pc - len, irb[pc + 3].raw, irb[pc + 4].raw, irb[pc + 2].raw); 411 break; 412 case IR.InfiniteEnd, IR.InfiniteQEnd, IR.InfiniteBloomEnd, IR.OrEnd: //ditto 413 uint len = irb[pc].data; 414 formattedWrite(output, " pc=>%u", pc-len); 415 break; 416 case IR.LookaheadEnd, IR.NeglookaheadEnd: //ditto 417 uint len = irb[pc].data; 418 formattedWrite(output, " pc=>%u", pc-len); 419 break; 420 case IR.GroupStart, IR.GroupEnd: 421 uint n = irb[pc].data; 422 string name; 423 foreach (v;dict) 424 if (v.group == n) 425 { 426 name = "'"~v.name~"'"; 427 break; 428 } 429 formattedWrite(output, " %s #%u " ~ (irb[pc].backreference ? "referenced" : ""), 430 name, n); 431 break; 432 case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart: 433 uint len = irb[pc].data; 434 uint start = irb[pc+1].raw, end = irb[pc+2].raw; 435 formattedWrite(output, " pc=>%u [%u..%u]", pc + len + IRL!(IR.LookaheadStart), start, end); 436 break; 437 case IR.Backref: case IR.CodepointSet: case IR.Trie: 438 uint n = irb[pc].data; 439 formattedWrite(output, " %u", n); 440 if (irb[pc].code == IR.Backref) 441 formattedWrite(output, " %s", irb[pc].localRef ? "local" : "global"); 442 break; 443 default://all data-free instructions 444 } 445 if (irb[pc].hotspot) 446 formattedWrite(output, " Hotspot %u", irb[pc+1].raw); 447 return output.data; 448 } 449 450 //disassemble the whole chunk 451 @trusted void printBytecode()(in Bytecode[] slice, in NamedGroup[] dict=[]) 452 { 453 import std.stdio : writeln; 454 for (uint pc=0; pc<slice.length; pc += slice[pc].length) 455 writeln("\t", disassemble(slice, pc, dict)); 456 } 457 458 // Encapsulates memory management, explicit ref counting 459 // and the exact type of engine created 460 // there is a single instance per engine combination type x Char 461 // In future may also maintain a (TLS?) cache of memory 462 interface MatcherFactory(Char) 463 { 464 @safe: 465 Matcher!Char create(const ref Regex!Char, in Char[] input) const; 466 Matcher!Char dup(Matcher!Char m, in Char[] input) const; 467 size_t incRef(Matcher!Char m) const; 468 size_t decRef(Matcher!Char m) const; 469 } 470 471 // Only memory management, no compile-time vs run-time specialities 472 abstract class GenericFactory(alias EngineType, Char) : MatcherFactory!Char 473 { 474 import core.memory : pureFree; 475 import std.internal.memory : enforceMalloc; 476 import core.memory : GC; 477 // round up to next multiple of size_t for alignment purposes 478 enum classSize = (__traits(classInstanceSize, EngineType!Char) + size_t.sizeof - 1) & ~(size_t.sizeof - 1); 479 480 EngineType!Char construct(const ref Regex!Char re, in Char[] input, void[] memory) const; 481 482 override EngineType!Char create(const ref Regex!Char re, in Char[] input) const @trusted 483 { 484 immutable size = EngineType!Char.initialMemory(re) + classSize; 485 auto memory = enforceMalloc(size)[0 .. size]; 486 scope(failure) pureFree(memory.ptr); 487 GC.addRange(memory.ptr, classSize); 488 auto engine = construct(re, input, memory); 489 assert(engine.refCount == 1); 490 assert(cast(void*) engine == memory.ptr); 491 return engine; 492 } 493 494 override EngineType!Char dup(Matcher!Char engine, in Char[] input) const @trusted 495 { 496 immutable size = EngineType!Char.initialMemory(engine.pattern) + classSize; 497 auto memory = enforceMalloc(size)[0 .. size]; 498 scope(failure) pureFree(memory.ptr); 499 auto copy = construct(engine.pattern, input, memory); 500 GC.addRange(memory.ptr, classSize); 501 engine.dupTo(copy, memory[classSize .. size]); 502 assert(copy.refCount == 1); 503 return copy; 504 } 505 506 override size_t incRef(Matcher!Char m) const 507 { 508 return ++m.refCount; 509 } 510 511 override size_t decRef(Matcher!Char m) const @trusted 512 { 513 assert(m.refCount != 0); 514 auto cnt = --m.refCount; 515 if (cnt == 0) 516 { 517 void* ptr = cast(void*) m; 518 GC.removeRange(ptr); 519 pureFree(ptr); 520 } 521 return cnt; 522 } 523 } 524 525 // A factory for run-time engines 526 class RuntimeFactory(alias EngineType, Char) : GenericFactory!(EngineType, Char) 527 { 528 override EngineType!Char construct(const ref Regex!Char re, in Char[] input, void[] memory) const 529 { 530 import core.lifetime : emplace; 531 return emplace!(EngineType!Char)(memory[0 .. classSize], 532 re, Input!Char(input), memory[classSize .. $]); 533 } 534 } 535 536 // A factory for compile-time engine 537 class CtfeFactory(alias EngineType, Char, alias func) : GenericFactory!(EngineType, Char) 538 { 539 override EngineType!Char construct(const ref Regex!Char re, in Char[] input, void[] memory) const 540 { 541 import core.lifetime : emplace; 542 return emplace!(EngineType!Char)(memory[0 .. classSize], 543 re, &func, Input!Char(input), memory[classSize .. $]); 544 } 545 } 546 547 private auto defaultFactoryImpl(Char)(const ref Regex!Char re) 548 { 549 import std.regex.internal.backtracking : BacktrackingMatcher; 550 import std.regex.internal.thompson : ThompsonMatcher; 551 import std.algorithm.searching : canFind; 552 static MatcherFactory!Char backtrackingFactory; 553 static MatcherFactory!Char thompsonFactory; 554 if (re.backrefed.canFind!"a != 0") 555 { 556 if (backtrackingFactory is null) 557 backtrackingFactory = new RuntimeFactory!(BacktrackingMatcher, Char); 558 return backtrackingFactory; 559 } 560 else 561 { 562 if (thompsonFactory is null) 563 thompsonFactory = new RuntimeFactory!(ThompsonMatcher, Char); 564 return thompsonFactory; 565 } 566 } 567 568 // Used to generate a pure wrapper for defaultFactoryImpl. Based on the example in 569 // the std.traits.SetFunctionAttributes documentation. 570 auto assumePureFunction(T)(T t) 571 if (isFunctionPointer!T) 572 { 573 enum attrs = functionAttributes!T | FunctionAttribute.pure_; 574 return cast(SetFunctionAttributes!(T, functionLinkage!T, attrs)) t; 575 } 576 577 // A workaround for R-T enum re = regex(...) 578 template defaultFactory(Char) 579 { 580 // defaultFactory is constructed as a safe, pure wrapper over defaultFactoryImpl. 581 // It can be faked as pure because the static mutable variables are used to cache 582 // the key and character matcher. The technique used avoids delegates and GC. 583 @property MatcherFactory!Char defaultFactory(const ref Regex!Char re) @safe pure 584 { 585 static auto impl(const ref Regex!Char re) 586 { 587 return defaultFactoryImpl(re); 588 } 589 590 static auto pureImpl(const ref Regex!Char re) @trusted 591 { 592 auto p = assumePureFunction(&impl); 593 return p(re); 594 } 595 596 return pureImpl(re); 597 } 598 } 599 600 // Defining it as an interface has the undesired side-effect: 601 // casting any class to an interface silently adjusts pointer to point to a nested vtbl 602 abstract class Matcher(Char) 603 { 604 abstract: 605 // Get a (next) match 606 int match(Group!size_t[] matches) pure; 607 // This only maintains internal ref-count, 608 // deallocation happens inside MatcherFactory 609 @property ref size_t refCount() @safe; 610 // Copy internal state to another engine, using memory arena 'memory' 611 void dupTo(Matcher!Char m, void[] memory); 612 // The pattern loaded 613 @property ref const(Regex!Char) pattern() @safe; 614 // Re-arm the engine with new Input 615 Matcher rearm(in Char[] stream); 616 } 617 618 /++ 619 `Regex` object holds regular expression pattern in compiled form. 620 Instances of this object are constructed via calls to `regex`. 621 This is an intended form for caching and storage of frequently 622 used regular expressions. 623 +/ 624 struct Regex(Char) 625 { 626 //temporary workaround for identifier lookup 627 CodepointSet[] charsets; // 628 Bytecode[] ir; //compiled bytecode of pattern 629 630 631 @safe @property bool empty() const nothrow { return ir is null; } 632 /++ 633 `namedCaptures` returns a range of all named captures in a given regular expression. 634 +/ 635 @safe @property auto namedCaptures() 636 { 637 static struct NamedGroupRange 638 { 639 private: 640 const(NamedGroup)[] groups; 641 size_t start; 642 size_t end; 643 public: 644 this(const(NamedGroup)[] g, size_t s, size_t e) 645 { 646 assert(s <= e); 647 assert(e <= g.length); 648 groups = g; 649 start = s; 650 end = e; 651 } 652 653 @property string front() { return groups[start].name; } 654 @property string back() { return groups[end-1].name; } 655 @property bool empty() { return start >= end; } 656 @property size_t length() { return end - start; } 657 alias opDollar = length; 658 @property NamedGroupRange save() 659 { 660 return NamedGroupRange(groups, start, end); 661 } 662 void popFront() { assert(!empty); start++; } 663 void popBack() { assert(!empty); end--; } 664 string opIndex()(size_t i) 665 { 666 assert(start + i < end, 667 "Requested named group is out of range."); 668 return groups[start+i].name; 669 } 670 NamedGroupRange opSlice(size_t low, size_t high) { 671 assert(low <= high); 672 assert(start + high <= end); 673 return NamedGroupRange(groups, start + low, start + high); 674 } 675 NamedGroupRange opSlice() { return this.save; } 676 } 677 return NamedGroupRange(dict, 0, dict.length); 678 } 679 680 package(std.regex): 681 import std.regex.internal.kickstart : Kickstart; //TODO: get rid of this dependency 682 const(NamedGroup)[] dict; // maps name -> user group number 683 uint ngroup; // number of internal groups 684 uint maxCounterDepth; // max depth of nested {n,m} repetitions 685 uint hotspotTableSize; // number of entries in merge table 686 uint threadCount; // upper bound on number of Thompson VM threads 687 uint flags; // global regex flags 688 public const(CharMatcher)[] matchers; // tables that represent character sets 689 public const(BitTable)[] filters; // bloom filters for conditional loops 690 uint[] backrefed; // bit array of backreferenced submatches 691 Kickstart!Char kickstart; 692 MatcherFactory!Char factory; // produces optimal matcher for this pattern 693 immutable(Char)[] pattern; // copy of pattern to serve as cache key 694 695 const(Regex) withFactory(MatcherFactory!Char factory) pure const @trusted 696 { 697 auto r = cast() this; 698 r.factory = factory; 699 return r; 700 } 701 702 const(Regex) withFlags(uint newFlags) pure const @trusted 703 { 704 auto r = cast() this; 705 r.flags = newFlags; 706 return r; 707 } 708 709 const(Regex) withCode(const(Bytecode)[] code) pure const @trusted 710 { 711 auto r = cast() this; 712 r.ir = code.dup; // TODO: sidestep const instead? 713 return r; 714 } 715 716 const(Regex) withNGroup(uint nGroup) pure const @trusted 717 { 718 auto r = cast() this; 719 r.ngroup = nGroup; 720 return r; 721 } 722 723 //bit access helper 724 uint isBackref(uint n) 725 { 726 if (n/32 >= backrefed.length) 727 return 0; 728 return backrefed[n / 32] & (1 << (n & 31)); 729 } 730 731 //check if searching is not needed 732 void checkIfOneShot() 733 { 734 L_CheckLoop: 735 for (uint i = 0; i < ir.length; i += ir[i].length) 736 { 737 switch (ir[i].code) 738 { 739 case IR.Bof: 740 flags |= RegexInfo.oneShot; 741 break L_CheckLoop; 742 case IR.GroupStart, IR.GroupEnd, IR.Bol, IR.Eol, IR.Eof, 743 IR.Wordboundary, IR.Notwordboundary: 744 break; 745 default: 746 break L_CheckLoop; 747 } 748 } 749 } 750 751 //print out disassembly a program's IR 752 @trusted debug(std_regex_parser) void print() const 753 {//@@@BUG@@@ write is system 754 for (uint i = 0; i < ir.length; i += ir[i].length) 755 { 756 writefln("%d\t%s ", i, disassemble(ir, i, dict)); 757 } 758 writeln("Total merge table size: ", hotspotTableSize); 759 writeln("Max counter nesting depth: ", maxCounterDepth); 760 } 761 762 public string toString()() const 763 { 764 import std.format : format; 765 static if (is(typeof(pattern) : string)) 766 alias patternString = pattern; 767 else 768 { 769 import std.conv : to; 770 auto patternString = conv.to!string(pattern); 771 } 772 auto quotedEscapedPattern = format("%(%s %)", [patternString]); 773 auto flagString = regexOptionsToString(flags); 774 return "Regex!" ~ Char.stringof ~ "(" ~ quotedEscapedPattern ~ ", \"" ~ flagString ~ "\")"; 775 } 776 } 777 778 // The stuff below this point is temporarrily part of IR module 779 // but may need better place in the future (all internals) 780 package(std.regex): 781 782 //Simple UTF-string abstraction compatible with stream interface 783 struct Input(Char) 784 if (is(Char :dchar)) 785 { 786 import std.utf : decode; 787 alias DataIndex = size_t; 788 enum bool isLoopback = false; 789 alias String = const(Char)[]; 790 String _origin; 791 size_t _index; 792 793 //constructs Input object out of plain string 794 this(String input, size_t idx = 0) 795 { 796 _origin = input; 797 _index = idx; 798 } 799 800 //codepoint at current stream position 801 pragma(inline, true) bool nextChar(ref dchar res, ref size_t pos) 802 { 803 pos = _index; 804 // DMD's inliner hates multiple return functions 805 // but can live with single statement if/else bodies 806 bool n = !(_index == _origin.length); 807 if (n) 808 res = decode(_origin, _index); 809 return n; 810 } 811 @property bool atEnd(){ 812 return _index == _origin.length; 813 } 814 bool search(Kickstart)(ref const Kickstart kick, ref dchar res, ref size_t pos) 815 { 816 size_t idx = kick.search(_origin, _index); 817 _index = idx; 818 return nextChar(res, pos); 819 } 820 821 //index of at End position 822 @property size_t lastIndex(){ return _origin.length; } 823 824 //support for backtracker engine, might not be present 825 void reset(size_t index){ _index = index; } 826 827 String opSlice(size_t start, size_t end){ return _origin[start .. end]; } 828 829 auto loopBack(size_t index){ return BackLooper!Input(this, index); } 830 } 831 832 struct BackLooperImpl(Input) 833 { 834 import std.utf : strideBack; 835 alias DataIndex = size_t; 836 alias String = Input.String; 837 enum bool isLoopback = true; 838 String _origin; 839 size_t _index; 840 this(Input input, size_t index) 841 { 842 _origin = input._origin; 843 _index = index; 844 } 845 this(String input) 846 { 847 _origin = input; 848 _index = input.length; 849 } 850 @trusted bool nextChar(ref dchar res,ref size_t pos) 851 { 852 pos = _index; 853 if (_index == 0) 854 return false; 855 856 res = _origin[0.._index].back; 857 _index -= strideBack(_origin, _index); 858 859 return true; 860 } 861 @property atEnd(){ return _index == 0 || _index == strideBack(_origin, _index); } 862 auto loopBack(size_t index){ return Input(_origin, index); } 863 864 //support for backtracker engine, might not be present 865 //void reset(size_t index){ _index = index ? index-std.utf.strideBack(_origin, index) : 0; } 866 void reset(size_t index){ _index = index; } 867 868 String opSlice(size_t start, size_t end){ return _origin[end .. start]; } 869 //index of at End position 870 @property size_t lastIndex(){ return 0; } 871 } 872 873 template BackLooper(E) 874 { 875 static if (is(E : BackLooperImpl!U, U)) 876 { 877 alias BackLooper = U; 878 } 879 else 880 { 881 alias BackLooper = BackLooperImpl!E; 882 } 883 } 884 885 //both helpers below are internal, on its own are quite "explosive" 886 //unsafe, no initialization of elements 887 @system pure T[] mallocArray(T)(size_t len) 888 { 889 import core.memory : pureMalloc; 890 return (cast(T*) pureMalloc(len * T.sizeof))[0 .. len]; 891 } 892 893 //very unsafe, no initialization 894 @system T[] arrayInChunk(T)(size_t len, ref void[] chunk) 895 { 896 auto ret = (cast(T*) chunk.ptr)[0 .. len]; 897 chunk = chunk[len * T.sizeof .. $]; 898 return ret; 899 } 900 901 // 902 @trusted uint lookupNamedGroup(String)(const(NamedGroup)[] dict, String name) 903 {//equal is @system? 904 import std.algorithm.comparison : equal; 905 import std.algorithm.iteration : map; 906 import std.conv : text; 907 import std.range : assumeSorted; 908 909 auto fnd = assumeSorted!"cmp(a,b) < 0"(map!"a.name"(dict)).lowerBound(name).length; 910 enforce(fnd < dict.length && equal(dict[fnd].name, name), 911 text("no submatch named ", name)); 912 return dict[fnd].group; 913 } 914 915 // whether ch is one of unicode newline sequences 916 // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985 917 bool endOfLine()(dchar front, bool seenCr) 918 { 919 return ((front == '\n') ^ seenCr) || front == '\r' 920 || front == NEL || front == LS || front == PS; 921 } 922 923 // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985 924 bool startOfLine()(dchar back, bool seenNl) 925 { 926 return ((back == '\r') ^ seenNl) || back == '\n' 927 || back == NEL || back == LS || back == PS; 928 } 929 930 ///Exception object thrown in case of errors during regex compilation. 931 public class RegexException : Exception 932 { 933 mixin basicExceptionCtors; 934 } 935 936 // simple 128-entry bit-table used with a hash function 937 struct BitTable { 938 uint[4] filter; 939 940 this(CodepointSet set){ 941 foreach (iv; set.byInterval) 942 { 943 foreach (v; iv.a .. iv.b) 944 add(v); 945 } 946 } 947 948 void add()(dchar ch){ 949 immutable i = index(ch); 950 filter[i >> 5] |= 1<<(i & 31); 951 } 952 // non-zero -> might be present, 0 -> absent 953 bool opIndex()(dchar ch) const{ 954 immutable i = index(ch); 955 return (filter[i >> 5]>>(i & 31)) & 1; 956 } 957 958 static uint index()(dchar ch){ 959 return ((ch >> 7) ^ ch) & 0x7F; 960 } 961 } 962 963 struct CharMatcher { 964 BitTable ascii; // fast path for ASCII 965 Trie trie; // slow path for Unicode 966 967 this(CodepointSet set) 968 { 969 auto asciiSet = set & unicode.ASCII; 970 ascii = BitTable(asciiSet); 971 trie = makeTrie(set); 972 } 973 974 bool opIndex()(dchar ch) const 975 { 976 if (ch < 0x80) 977 return ascii[ch]; 978 else 979 return trie[ch]; 980 } 981 } 982 983 // Internal non-resizeble array, switches between inline storage and CoW 984 // POD-only 985 struct SmallFixedArray(T, uint SMALL=3) 986 if (!hasElaborateDestructor!T) 987 { 988 import std.internal.memory : enforceMalloc; 989 import core.memory : pureFree; 990 static struct Payload 991 { 992 size_t refcount; 993 T[0] placeholder; 994 inout(T)* ptr() inout { return placeholder.ptr; } 995 } 996 static assert(Payload.sizeof == size_t.sizeof); 997 union 998 { 999 Payload* big; 1000 T[SMALL] small; 1001 } 1002 size_t _sizeMask; 1003 enum BIG_MASK = size_t(1)<<(8*size_t.sizeof-1); 1004 enum SIZE_MASK = ~BIG_MASK; 1005 1006 @property bool isBig() const { return (_sizeMask & BIG_MASK) != 0; } 1007 @property size_t length() const { return _sizeMask & SIZE_MASK; } 1008 1009 this(size_t size) 1010 { 1011 if (size <= SMALL) 1012 { 1013 small[] = T.init; 1014 _sizeMask = size; 1015 } 1016 else 1017 { 1018 big = cast(Payload*) enforceMalloc(Payload.sizeof + T.sizeof*size); 1019 big.refcount = 1; 1020 _sizeMask = size | BIG_MASK; 1021 } 1022 } 1023 1024 private @trusted @property inout(T)[] internalSlice() inout 1025 { 1026 return isBig ? big.ptr[0 .. length] : small[0 .. length]; 1027 } 1028 1029 this(this) 1030 { 1031 if (isBig) 1032 { 1033 big.refcount++; 1034 } 1035 } 1036 1037 bool opEquals(SmallFixedArray a) 1038 { 1039 return internalSlice[] == a.internalSlice[]; 1040 } 1041 1042 size_t toHash() const 1043 { 1044 return hashOf(internalSlice[]); 1045 } 1046 1047 ref inout(T) opIndex(size_t idx) inout 1048 { 1049 return internalSlice[idx]; 1050 } 1051 1052 // accesses big to test self-referencing so not @safe 1053 @trusted ref opAssign(SmallFixedArray arr) 1054 { 1055 if (isBig) 1056 { 1057 if (arr.isBig) 1058 { 1059 if (big is arr.big) return this; // self-assign 1060 else 1061 { 1062 abandonRef(); 1063 _sizeMask = arr._sizeMask; 1064 big = arr.big; 1065 big.refcount++; 1066 } 1067 } 1068 else 1069 { 1070 abandonRef(); 1071 _sizeMask = arr._sizeMask; 1072 small = arr.small; 1073 } 1074 } 1075 else 1076 { 1077 if (arr.isBig) 1078 { 1079 _sizeMask = arr._sizeMask; 1080 big = arr.big; 1081 big.refcount++; 1082 } 1083 else 1084 { 1085 _sizeMask = arr._sizeMask; 1086 small = arr.small; 1087 } 1088 } 1089 return this; 1090 } 1091 1092 void mutate(scope void delegate(T[]) pure filler) 1093 { 1094 if (isBig && big.refcount != 1) // copy on write 1095 { 1096 auto oldSizeMask = _sizeMask; 1097 auto newbig = cast(Payload*) enforceMalloc(Payload.sizeof + T.sizeof*length); 1098 newbig.refcount = 1; 1099 abandonRef(); 1100 big = newbig; 1101 _sizeMask = oldSizeMask; 1102 } 1103 filler(internalSlice); 1104 } 1105 1106 ~this() 1107 { 1108 if (isBig) 1109 { 1110 abandonRef(); 1111 } 1112 } 1113 1114 @trusted private void abandonRef() 1115 { 1116 assert(isBig); 1117 if (--big.refcount == 0) 1118 { 1119 pureFree(big); 1120 _sizeMask = 0; 1121 assert(!isBig); 1122 } 1123 } 1124 } 1125 1126 @system unittest 1127 { 1128 alias SA = SmallFixedArray!(int, 2); 1129 SA create(int[] data) 1130 { 1131 SA a = SA(data.length); 1132 a.mutate((slice) { slice[] = data[]; }); 1133 assert(a.internalSlice == data); 1134 return a; 1135 } 1136 1137 { 1138 SA a; 1139 a = SA(1); 1140 assert(a.length == 1); 1141 a = SA.init; 1142 assert(a.length == 0); 1143 } 1144 1145 { 1146 SA a, b, c, d; 1147 assert(a.length == 0); 1148 assert(a.internalSlice == b.internalSlice); 1149 a = create([1]); 1150 assert(a.internalSlice == [1]); 1151 b = create([2, 3]); 1152 assert(b.internalSlice == [2, 3]); 1153 c = create([3, 4, 5]); 1154 d = create([5, 6, 7, 8]); 1155 assert(c.isBig); 1156 a = c; 1157 assert(a.isBig); 1158 assert(a.big is c.big); 1159 assert(a.big.refcount == 2); 1160 assert(a.internalSlice == [3, 4, 5]); 1161 assert(c.internalSlice == [3, 4, 5]); 1162 a = b; 1163 assert(!a.isBig); 1164 assert(a.internalSlice == [2, 3]); 1165 assert(c.big.refcount == 1); 1166 a = c; 1167 assert(c.big.refcount == 2); 1168 1169 // mutate copies on write if ref-count is not 1 1170 a.mutate((slice){ slice[] = 1; }); 1171 assert(a.internalSlice == [1, 1, 1]); 1172 assert(c.internalSlice == [3, 4, 5]); 1173 assert(a.isBig && c.isBig); 1174 assert(a.big.refcount == 1); 1175 assert(c.big.refcount == 1); 1176 1177 auto e = d; 1178 assert(e.big.refcount == 2); 1179 auto f = d; 1180 f = a; 1181 assert(f.isBig); 1182 assert(f.internalSlice == [1, 1, 1]); 1183 assert(f.big.refcount == 2); // a & f 1184 assert(e.big.refcount == 2); // d & e 1185 a = c; 1186 assert(f.big.refcount == 1); // f 1187 assert(e.big.refcount == 2); // d & e 1188 a = a; 1189 a = a; 1190 a = a; 1191 assert(a.big.refcount == 2); // a & c 1192 } 1193 } 1194