1 //Written in the D programming language 2 /* 3 Regular expression pattern parser. 4 */ 5 module std.regex.internal.parser; 6 7 import std.regex.internal.ir; 8 import std.range.primitives, std.uni, std.meta, 9 std.traits, std.typecons, std.exception; 10 static import std.ascii; 11 12 // package relevant info from parser into a regex object 13 auto makeRegex(S, CG)(Parser!(S, CG) p) 14 { 15 import std.regex.internal.backtracking : BacktrackingMatcher; 16 import std.regex.internal.thompson : ThompsonMatcher; 17 import std.algorithm.searching : canFind; 18 alias Char = BasicElementOf!S; 19 Regex!Char re; 20 auto g = p.g; 21 with(re) 22 { 23 ir = g.ir; 24 dict = g.dict; 25 ngroup = g.ngroup; 26 maxCounterDepth = g.counterDepth; 27 flags = p.re_flags; 28 charsets = g.charsets; 29 matchers = g.matchers; 30 backrefed = g.backrefed; 31 re.pattern = p.origin.idup; 32 re.postprocess(); 33 // check if we have backreferences, if so - use backtracking 34 if (__ctfe) factory = null; // allows us to use the awful enum re = regex(...); 35 else if (re.backrefed.canFind!"a != 0") 36 factory = new RuntimeFactory!(BacktrackingMatcher, Char); 37 else 38 factory = new RuntimeFactory!(ThompsonMatcher, Char); 39 debug(std_regex_parser) 40 { 41 __ctfe || print(); 42 } 43 //@@@BUG@@@ (not reduced) 44 //somehow just using validate _collides_ with std.utf.validate (!) 45 version (assert) re.validateRe(); 46 } 47 return re; 48 } 49 50 // helper for unittest 51 auto makeRegex(S)(S arg) 52 if (isSomeString!S) 53 { 54 return makeRegex(Parser!(S, CodeGen)(arg, "")); 55 } 56 57 @system unittest 58 { 59 import std.algorithm.comparison : equal; 60 auto re = makeRegex(`(?P<name>\w+) = (?P<var>\d+)`); 61 auto nc = re.namedCaptures; 62 static assert(isRandomAccessRange!(typeof(nc))); 63 assert(!nc.empty); 64 assert(nc.length == 2); 65 assert(nc.equal(["name", "var"])); 66 assert(nc[0] == "name"); 67 assert(nc[1..$].equal(["var"])); 68 69 re = makeRegex(`(\w+) (?P<named>\w+) (\w+)`); 70 nc = re.namedCaptures; 71 assert(nc.length == 1); 72 assert(nc[0] == "named"); 73 assert(nc.front == "named"); 74 assert(nc.back == "named"); 75 76 re = makeRegex(`(\w+) (\w+)`); 77 nc = re.namedCaptures; 78 assert(nc.empty); 79 80 re = makeRegex(`(?P<year>\d{4})/(?P<month>\d{2})/(?P<day>\d{2})/`); 81 nc = re.namedCaptures; 82 auto cp = nc.save; 83 assert(nc.equal(cp)); 84 nc.popFront(); 85 assert(nc.equal(cp[1..$])); 86 nc.popBack(); 87 assert(nc.equal(cp[1 .. $ - 1])); 88 } 89 90 91 @trusted void reverseBytecode()(Bytecode[] code) 92 { 93 Bytecode[] rev = new Bytecode[code.length]; 94 uint revPc = cast(uint) rev.length; 95 Stack!(Tuple!(uint, uint, uint)) stack; 96 uint start = 0; 97 uint end = cast(uint) code.length; 98 for (;;) 99 { 100 for (uint pc = start; pc < end; ) 101 { 102 immutable len = code[pc].length; 103 if (code[pc].code == IR.GotoEndOr) 104 break; //pick next alternation branch 105 if (code[pc].isAtom) 106 { 107 rev[revPc - len .. revPc] = code[pc .. pc + len]; 108 revPc -= len; 109 pc += len; 110 } 111 else if (code[pc].isStart || code[pc].isEnd) 112 { 113 //skip over other embedded lookbehinds they are reversed 114 if (code[pc].code == IR.LookbehindStart 115 || code[pc].code == IR.NeglookbehindStart) 116 { 117 immutable blockLen = len + code[pc].data 118 + code[pc].pairedLength; 119 rev[revPc - blockLen .. revPc] = code[pc .. pc + blockLen]; 120 pc += blockLen; 121 revPc -= blockLen; 122 continue; 123 } 124 immutable second = code[pc].indexOfPair(pc); 125 immutable secLen = code[second].length; 126 rev[revPc - secLen .. revPc] = code[second .. second + secLen]; 127 revPc -= secLen; 128 if (code[pc].code == IR.OrStart) 129 { 130 //we pass len bytes forward, but secLen in reverse 131 immutable revStart = revPc - (second + len - secLen - pc); 132 uint r = revStart; 133 uint i = pc + IRL!(IR.OrStart); 134 while (code[i].code == IR.Option) 135 { 136 if (code[i - 1].code != IR.OrStart) 137 { 138 assert(code[i - 1].code == IR.GotoEndOr); 139 rev[r - 1] = code[i - 1]; 140 } 141 rev[r] = code[i]; 142 auto newStart = i + IRL!(IR.Option); 143 auto newEnd = newStart + code[i].data; 144 auto newRpc = r + code[i].data + IRL!(IR.Option); 145 if (code[newEnd].code != IR.OrEnd) 146 { 147 newRpc--; 148 } 149 stack.push(tuple(newStart, newEnd, newRpc)); 150 r += code[i].data + IRL!(IR.Option); 151 i += code[i].data + IRL!(IR.Option); 152 } 153 pc = i; 154 revPc = revStart; 155 assert(code[pc].code == IR.OrEnd); 156 } 157 else 158 pc += len; 159 } 160 } 161 if (stack.empty) 162 break; 163 start = stack.top[0]; 164 end = stack.top[1]; 165 revPc = stack.top[2]; 166 stack.pop(); 167 } 168 code[] = rev[]; 169 } 170 171 struct CodeGen 172 { 173 Bytecode[] ir; // resulting bytecode 174 Stack!(uint) fixupStack; // stack of opened start instructions 175 NamedGroup[] dict; // maps name -> user group number 176 Stack!(uint) groupStack; // stack of current number of group 177 uint nesting = 0; // group nesting level and repetitions step 178 uint lookaroundNest = 0; // nesting of lookaround 179 uint counterDepth = 0; // current depth of nested counted repetitions 180 CodepointSet[] charsets; // sets for char classes 181 const(CharMatcher)[] matchers; // matchers for char classes 182 uint[] backrefed; // bitarray for groups refered by backref 183 uint ngroup; // final number of groups (of all patterns) 184 185 void start(uint length) 186 { 187 if (!__ctfe) 188 ir.reserve((length*5+2)/4); 189 fixupStack.push(0); 190 groupStack.push(1);//0 - whole match 191 } 192 193 //mark referenced groups for latter processing 194 void markBackref(uint n) 195 { 196 if (n/32 >= backrefed.length) 197 backrefed.length = n/32 + 1; 198 backrefed[n / 32] |= 1 << (n & 31); 199 } 200 201 bool isOpenGroup(uint n) 202 { 203 import std.algorithm.searching : canFind; 204 // walk the fixup stack and see if there are groups labeled 'n' 205 // fixup '0' is reserved for alternations 206 return fixupStack.data[1..$]. 207 canFind!(fix => ir[fix].code == IR.GroupStart && ir[fix].data == n)(); 208 } 209 210 void put(Bytecode code) 211 { 212 enforce(ir.length < maxCompiledLength, 213 "maximum compiled pattern length is exceeded"); 214 ir ~= code; 215 } 216 217 void putRaw(uint number) 218 { 219 enforce(ir.length < maxCompiledLength, 220 "maximum compiled pattern length is exceeded"); 221 ir ~= Bytecode.fromRaw(number); 222 } 223 224 //try to generate optimal IR code for this CodepointSet 225 @trusted void charsetToIr(CodepointSet set) 226 {//@@@BUG@@@ writeln is @system 227 uint chars = cast(uint) set.length; 228 if (chars < Bytecode.maxSequence) 229 { 230 switch (chars) 231 { 232 case 1: 233 put(Bytecode(IR.Char, set.byCodepoint.front)); 234 break; 235 case 0: 236 throw new RegexException("empty CodepointSet not allowed"); 237 default: 238 foreach (ch; set.byCodepoint) 239 put(Bytecode(IR.OrChar, ch, chars)); 240 } 241 } 242 else 243 { 244 import std.algorithm.searching : countUntil; 245 const ivals = set.byInterval; 246 immutable n = charsets.countUntil(set); 247 if (n >= 0) 248 { 249 if (ivals.length*2 > maxCharsetUsed) 250 put(Bytecode(IR.Trie, cast(uint) n)); 251 else 252 put(Bytecode(IR.CodepointSet, cast(uint) n)); 253 return; 254 } 255 if (ivals.length*2 > maxCharsetUsed) 256 { 257 auto t = getMatcher(set); 258 put(Bytecode(IR.Trie, cast(uint) matchers.length)); 259 matchers ~= t; 260 debug(std_regex_allocation) writeln("Trie generated"); 261 } 262 else 263 { 264 put(Bytecode(IR.CodepointSet, cast(uint) charsets.length)); 265 matchers ~= CharMatcher.init; 266 } 267 charsets ~= set; 268 assert(charsets.length == matchers.length); 269 } 270 } 271 272 void genLogicGroup() 273 { 274 nesting++; 275 pushFixup(length); 276 put(Bytecode(IR.Nop, 0)); 277 } 278 279 void genGroup() 280 { 281 nesting++; 282 pushFixup(length); 283 immutable nglob = groupStack.top++; 284 enforce(groupStack.top <= maxGroupNumber, "limit on number of submatches is exceeded"); 285 put(Bytecode(IR.GroupStart, nglob)); 286 } 287 288 void genNamedGroup(string name) 289 { 290 import std.array : insertInPlace; 291 import std.range : assumeSorted; 292 nesting++; 293 pushFixup(length); 294 immutable nglob = groupStack.top++; 295 enforce(groupStack.top <= maxGroupNumber, "limit on submatches is exceeded"); 296 auto t = NamedGroup(name, nglob); 297 auto d = assumeSorted!"a.name < b.name"(dict); 298 immutable ind = d.lowerBound(t).length; 299 insertInPlace(dict, ind, t); 300 put(Bytecode(IR.GroupStart, nglob)); 301 } 302 303 //generate code for start of lookaround: (?= (?! (?<= (?<! 304 void genLookaround(IR opcode) 305 { 306 nesting++; 307 pushFixup(length); 308 put(Bytecode(opcode, 0)); 309 put(Bytecode.fromRaw(0)); 310 put(Bytecode.fromRaw(0)); 311 groupStack.push(0); 312 lookaroundNest++; 313 enforce(lookaroundNest <= maxLookaroundDepth, 314 "maximum lookaround depth is exceeded"); 315 } 316 317 void endPattern(uint num) 318 { 319 import std.algorithm.comparison : max; 320 put(Bytecode(IR.End, num)); 321 ngroup = max(ngroup, groupStack.top); 322 groupStack.top = 1; // reset group counter 323 } 324 325 //fixup lookaround with start at offset fix and append a proper *-End opcode 326 void fixLookaround(uint fix) 327 { 328 lookaroundNest--; 329 ir[fix] = Bytecode(ir[fix].code, 330 cast(uint) ir.length - fix - IRL!(IR.LookaheadStart)); 331 auto g = groupStack.pop(); 332 assert(!groupStack.empty); 333 ir[fix+1] = Bytecode.fromRaw(groupStack.top); 334 //groups are cumulative across lookarounds 335 ir[fix+2] = Bytecode.fromRaw(groupStack.top+g); 336 groupStack.top += g; 337 if (ir[fix].code == IR.LookbehindStart || ir[fix].code == IR.NeglookbehindStart) 338 { 339 reverseBytecode(ir[fix + IRL!(IR.LookbehindStart) .. $]); 340 } 341 put(ir[fix].paired); 342 } 343 344 // repetition of {1,1} 345 void fixRepetition(uint offset) 346 { 347 import std.algorithm.mutation : copy; 348 immutable replace = ir[offset].code == IR.Nop; 349 if (replace) 350 { 351 copy(ir[offset + 1 .. $], ir[offset .. $ - 1]); 352 ir.length -= 1; 353 } 354 } 355 356 // repetition of {x,y} 357 void fixRepetition(uint offset, uint min, uint max, bool greedy) 358 { 359 static import std.algorithm.comparison; 360 import std.algorithm.mutation : copy; 361 import std.array : insertInPlace; 362 immutable replace = ir[offset].code == IR.Nop; 363 immutable len = cast(uint) ir.length - offset - replace; 364 if (max != infinite) 365 { 366 if (min != 1 || max != 1) 367 { 368 Bytecode op = Bytecode(greedy ? IR.RepeatStart : IR.RepeatQStart, len); 369 if (replace) 370 ir[offset] = op; 371 else 372 insertInPlace(ir, offset, op); 373 put(Bytecode(greedy ? IR.RepeatEnd : IR.RepeatQEnd, len)); 374 put(Bytecode.init); //hotspot 375 putRaw(1); 376 putRaw(min); 377 putRaw(max); 378 counterDepth = std.algorithm.comparison.max(counterDepth, nesting+1); 379 } 380 } 381 else if (min) //&& max is infinite 382 { 383 if (min != 1) 384 { 385 Bytecode op = Bytecode(greedy ? IR.RepeatStart : IR.RepeatQStart, len); 386 if (replace) 387 ir[offset] = op; 388 else 389 insertInPlace(ir, offset, op); 390 offset += 1;//so it still points to the repeated block 391 put(Bytecode(greedy ? IR.RepeatEnd : IR.RepeatQEnd, len)); 392 put(Bytecode.init); //hotspot 393 putRaw(1); 394 putRaw(min); 395 putRaw(min); 396 counterDepth = std.algorithm.comparison.max(counterDepth, nesting+1); 397 } 398 else if (replace) 399 { 400 copy(ir[offset+1 .. $], ir[offset .. $-1]); 401 ir.length -= 1; 402 } 403 put(Bytecode(greedy ? IR.InfiniteStart : IR.InfiniteQStart, len)); 404 enforce(ir.length + len < maxCompiledLength, "maximum compiled pattern length is exceeded"); 405 ir ~= ir[offset .. offset+len]; 406 //IR.InfinteX is always a hotspot 407 put(Bytecode(greedy ? IR.InfiniteEnd : IR.InfiniteQEnd, len)); 408 put(Bytecode.init); //merge index 409 } 410 else//vanila {0,inf} 411 { 412 Bytecode op = Bytecode(greedy ? IR.InfiniteStart : IR.InfiniteQStart, len); 413 if (replace) 414 ir[offset] = op; 415 else 416 insertInPlace(ir, offset, op); 417 //IR.InfinteX is always a hotspot 418 put(Bytecode(greedy ? IR.InfiniteEnd : IR.InfiniteQEnd, len)); 419 put(Bytecode.init); //merge index 420 } 421 } 422 423 void fixAlternation() 424 { 425 import std.array : insertInPlace; 426 uint fix = fixupStack.top; 427 if (ir.length > fix && ir[fix].code == IR.Option) 428 { 429 ir[fix] = Bytecode(ir[fix].code, cast(uint) ir.length - fix); 430 put(Bytecode(IR.GotoEndOr, 0)); 431 fixupStack.top = cast(uint) ir.length; //replace latest fixup for Option 432 put(Bytecode(IR.Option, 0)); 433 return; 434 } 435 uint len, orStart; 436 //start a new option 437 if (fixupStack.length == 1) 438 {//only root entry, effectively no fixup 439 len = cast(uint) ir.length + IRL!(IR.GotoEndOr); 440 orStart = 0; 441 } 442 else 443 {//IR.lookahead, etc. fixups that have length > 1, thus check ir[x].length 444 len = cast(uint) ir.length - fix - (ir[fix].length - 1); 445 orStart = fix + ir[fix].length; 446 } 447 insertInPlace(ir, orStart, Bytecode(IR.OrStart, 0), Bytecode(IR.Option, len)); 448 assert(ir[orStart].code == IR.OrStart); 449 put(Bytecode(IR.GotoEndOr, 0)); 450 fixupStack.push(orStart); //fixup for StartOR 451 fixupStack.push(cast(uint) ir.length); //for second Option 452 put(Bytecode(IR.Option, 0)); 453 } 454 455 // finalizes IR.Option, fix points to the first option of sequence 456 void finishAlternation(uint fix) 457 { 458 enforce(ir[fix].code == IR.Option, "no matching ')'"); 459 ir[fix] = Bytecode(ir[fix].code, cast(uint) ir.length - fix - IRL!(IR.OrStart)); 460 fix = fixupStack.pop(); 461 enforce(ir[fix].code == IR.OrStart, "no matching ')'"); 462 ir[fix] = Bytecode(IR.OrStart, cast(uint) ir.length - fix - IRL!(IR.OrStart)); 463 put(Bytecode(IR.OrEnd, cast(uint) ir.length - fix - IRL!(IR.OrStart))); 464 uint pc = fix + IRL!(IR.OrStart); 465 while (ir[pc].code == IR.Option) 466 { 467 pc = pc + ir[pc].data; 468 if (ir[pc].code != IR.GotoEndOr) 469 break; 470 ir[pc] = Bytecode(IR.GotoEndOr, cast(uint)(ir.length - pc - IRL!(IR.OrEnd))); 471 pc += IRL!(IR.GotoEndOr); 472 } 473 put(Bytecode.fromRaw(0)); 474 } 475 476 // returns: (flag - repetition possible?, fixup of the start of this "group") 477 Tuple!(bool, uint) onClose() 478 { 479 nesting--; 480 uint fix = popFixup(); 481 switch (ir[fix].code) 482 { 483 case IR.GroupStart: 484 put(Bytecode(IR.GroupEnd, ir[fix].data)); 485 return tuple(true, fix); 486 case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart: 487 assert(lookaroundNest); 488 fixLookaround(fix); 489 return tuple(false, 0u); 490 case IR.Option: //| xxx ) 491 //two fixups: last option + full OR 492 finishAlternation(fix); 493 fix = topFixup; 494 switch (ir[fix].code) 495 { 496 case IR.GroupStart: 497 popFixup(); 498 put(Bytecode(IR.GroupEnd, ir[fix].data)); 499 return tuple(true, fix); 500 case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart: 501 assert(lookaroundNest); 502 fix = popFixup(); 503 fixLookaround(fix); 504 return tuple(false, 0u); 505 default://(?:xxx) 506 popFixup(); 507 return tuple(true, fix); 508 } 509 default://(?:xxx) 510 return tuple(true, fix); 511 } 512 } 513 514 uint popFixup(){ return fixupStack.pop(); } 515 516 void pushFixup(uint val){ return fixupStack.push(val); } 517 518 @property uint topFixup(){ return fixupStack.top; } 519 520 @property size_t fixupLength(){ return fixupStack.data.length; } 521 522 @property uint length(){ return cast(uint) ir.length; } 523 } 524 525 // safety limits 526 enum maxGroupNumber = 2^^19; 527 enum maxLookaroundDepth = 16; 528 // *Bytecode.sizeof, i.e. 1Mb of bytecode alone 529 enum maxCompiledLength = 2^^18; 530 // amounts to up to 4 Mb of auxilary table for matching 531 enum maxCumulativeRepetitionLength = 2^^20; 532 // marker to indicate infinite repetition 533 enum infinite = ~0u; 534 535 struct Parser(R, Generator) 536 if (isForwardRange!R && is(ElementType!R : dchar)) 537 { 538 dchar front; 539 bool empty; 540 R pat, origin; //keep full pattern for pretty printing error messages 541 uint re_flags = 0; //global flags e.g. multiline + internal ones 542 Generator g; 543 544 @trusted this(S)(R pattern, S flags) 545 if (isSomeString!S) 546 { 547 pat = origin = pattern; 548 //reserve slightly more then avg as sampled from unittests 549 parseFlags(flags); 550 front = ' ';//a safe default for freeform parsing 551 popFront(); 552 g.start(cast(uint) pat.length); 553 try 554 { 555 parseRegex(); 556 } 557 catch (Exception e) 558 { 559 error(e.msg);//also adds pattern location 560 } 561 g.endPattern(1); 562 } 563 564 void _popFront() 565 { 566 if (pat.empty) 567 { 568 empty = true; 569 } 570 else 571 { 572 front = pat.front; 573 pat.popFront(); 574 } 575 } 576 577 void skipSpace() 578 { 579 while (!empty && isWhite(front)) _popFront(); 580 } 581 582 void popFront() 583 { 584 _popFront(); 585 if (re_flags & RegexOption.freeform) skipSpace(); 586 } 587 588 auto save(){ return this; } 589 590 //parsing number with basic overflow check 591 uint parseDecimal() 592 { 593 uint r = 0; 594 while (std.ascii.isDigit(front)) 595 { 596 if (r >= (uint.max/10)) 597 error("Overflow in decimal number"); 598 r = 10*r + cast(uint)(front-'0'); 599 popFront(); 600 if (empty) break; 601 } 602 return r; 603 } 604 605 // 606 @trusted void parseFlags(S)(S flags) 607 {//@@@BUG@@@ text is @system 608 import std.conv : text; 609 foreach (ch; flags)//flags are ASCII anyway 610 { 611 L_FlagSwitch: 612 switch (ch) 613 { 614 615 foreach (i, op; __traits(allMembers, RegexOption)) 616 { 617 case RegexOptionNames[i]: 618 if (re_flags & mixin("RegexOption."~op)) 619 throw new RegexException(text("redundant flag specified: ",ch)); 620 re_flags |= mixin("RegexOption."~op); 621 break L_FlagSwitch; 622 } 623 default: 624 throw new RegexException(text("unknown regex flag '",ch,"'")); 625 } 626 } 627 } 628 629 //parse and store IR for regex pattern 630 @trusted void parseRegex() 631 { 632 uint fix;//fixup pointer 633 634 while (!empty) 635 { 636 debug(std_regex_parser) 637 __ctfe || writeln("*LR*\nSource: ", pat, "\nStack: ",fixupStack.data); 638 switch (front) 639 { 640 case '(': 641 popFront(); 642 if (front == '?') 643 { 644 popFront(); 645 switch (front) 646 { 647 case '#': 648 for (;;) 649 { 650 popFront(); 651 enforce(!empty, "Unexpected end of pattern"); 652 if (front == ')') 653 { 654 popFront(); 655 break; 656 } 657 } 658 break; 659 case ':': 660 g.genLogicGroup(); 661 popFront(); 662 break; 663 case '=': 664 g.genLookaround(IR.LookaheadStart); 665 popFront(); 666 break; 667 case '!': 668 g.genLookaround(IR.NeglookaheadStart); 669 popFront(); 670 break; 671 case 'P': 672 popFront(); 673 enforce(front == '<', "Expected '<' in named group"); 674 string name; 675 popFront(); 676 if (empty || !(isAlpha(front) || front == '_')) 677 error("Expected alpha starting a named group"); 678 name ~= front; 679 popFront(); 680 while (!empty && (isAlpha(front) || 681 front == '_' || std.ascii.isDigit(front))) 682 { 683 name ~= front; 684 popFront(); 685 } 686 enforce(front == '>', "Expected '>' closing named group"); 687 popFront(); 688 g.genNamedGroup(name); 689 break; 690 case '<': 691 popFront(); 692 if (front == '=') 693 g.genLookaround(IR.LookbehindStart); 694 else if (front == '!') 695 g.genLookaround(IR.NeglookbehindStart); 696 else 697 error("'!' or '=' expected after '<'"); 698 popFront(); 699 break; 700 default: 701 uint enableFlags, disableFlags; 702 bool enable = true; 703 do 704 { 705 switch (front) 706 { 707 case 's': 708 if (enable) 709 enableFlags |= RegexOption.singleline; 710 else 711 disableFlags |= RegexOption.singleline; 712 break; 713 case 'x': 714 if (enable) 715 enableFlags |= RegexOption.freeform; 716 else 717 disableFlags |= RegexOption.freeform; 718 break; 719 case 'i': 720 if (enable) 721 enableFlags |= RegexOption.casefold; 722 else 723 disableFlags |= RegexOption.casefold; 724 break; 725 case 'm': 726 if (enable) 727 enableFlags |= RegexOption.multiline; 728 else 729 disableFlags |= RegexOption.multiline; 730 break; 731 case '-': 732 if (!enable) 733 error(" unexpected second '-' in flags"); 734 enable = false; 735 break; 736 default: 737 error(" 's', 'x', 'i', 'm' or '-' expected after '(?' "); 738 } 739 popFront(); 740 }while (front != ')'); 741 popFront(); 742 re_flags |= enableFlags; 743 re_flags &= ~disableFlags; 744 } 745 } 746 else 747 { 748 g.genGroup(); 749 } 750 break; 751 case ')': 752 enforce(g.nesting, "Unmatched ')'"); 753 popFront(); 754 auto pair = g.onClose(); 755 if (pair[0]) 756 parseQuantifier(pair[1]); 757 break; 758 case '|': 759 popFront(); 760 g.fixAlternation(); 761 break; 762 default://no groups or whatever 763 immutable start = g.length; 764 parseAtom(); 765 parseQuantifier(start); 766 } 767 } 768 769 if (g.fixupLength != 1) 770 { 771 fix = g.popFixup(); 772 g.finishAlternation(fix); 773 enforce(g.fixupLength == 1, "no matching ')'"); 774 } 775 } 776 777 778 //parse and store IR for atom-quantifier pair 779 @trusted void parseQuantifier(uint offset) 780 {//copy is @system 781 if (empty) 782 return g.fixRepetition(offset); 783 uint min, max; 784 switch (front) 785 { 786 case '*': 787 min = 0; 788 max = infinite; 789 break; 790 case '?': 791 min = 0; 792 max = 1; 793 break; 794 case '+': 795 min = 1; 796 max = infinite; 797 break; 798 case '{': 799 popFront(); 800 enforce(!empty, "Unexpected end of regex pattern"); 801 enforce(std.ascii.isDigit(front), "First number required in repetition"); 802 min = parseDecimal(); 803 if (front == '}') 804 max = min; 805 else if (front == ',') 806 { 807 popFront(); 808 if (std.ascii.isDigit(front)) 809 max = parseDecimal(); 810 else if (front == '}') 811 max = infinite; 812 else 813 error("Unexpected symbol in regex pattern"); 814 skipSpace(); 815 enforce(front == '}', "Unmatched '{' in regex pattern"); 816 } 817 else 818 error("Unexpected symbol in regex pattern"); 819 enforce(min <= max, "Illegal {n,m} quantifier"); 820 break; 821 default: 822 g.fixRepetition(offset); 823 return; 824 } 825 bool greedy = true; 826 //check only if we managed to get new symbol 827 popFront(); 828 if (!empty && front == '?') 829 { 830 greedy = false; 831 popFront(); 832 } 833 g.fixRepetition(offset, min, max, greedy); 834 } 835 836 //parse and store IR for atom 837 void parseAtom() 838 { 839 if (empty) 840 return; 841 switch (front) 842 { 843 case '*', '?', '+', '|', '{', '}': 844 return error("'*', '+', '?', '{', '}' not allowed in atom"); 845 case '.': 846 if (re_flags & RegexOption.singleline) 847 g.put(Bytecode(IR.Any, 0)); 848 else 849 { 850 CodepointSet set; 851 g.charsetToIr(set.add('\n','\n'+1).add('\r', '\r'+1).inverted); 852 } 853 popFront(); 854 break; 855 case '[': 856 parseCharset(); 857 break; 858 case '\\': 859 _popFront(); 860 enforce(!empty, "Unfinished escape sequence"); 861 parseEscape(); 862 break; 863 case '^': 864 if (re_flags & RegexOption.multiline) 865 g.put(Bytecode(IR.Bol, 0)); 866 else 867 g.put(Bytecode(IR.Bof, 0)); 868 popFront(); 869 break; 870 case '$': 871 if (re_flags & RegexOption.multiline) 872 g.put(Bytecode(IR.Eol, 0)); 873 else 874 g.put(Bytecode(IR.Eof, 0)); 875 popFront(); 876 break; 877 default: 878 if (re_flags & RegexOption.casefold) 879 { 880 auto range = simpleCaseFoldings(front); 881 assert(range.length <= 5); 882 if (range.length == 1) 883 g.put(Bytecode(IR.Char, range.front)); 884 else 885 foreach (v; range) 886 g.put(Bytecode(IR.OrChar, v, cast(uint) range.length)); 887 } 888 else 889 g.put(Bytecode(IR.Char, front)); 890 popFront(); 891 } 892 } 893 894 //parse and store IR for CodepointSet 895 void parseCharset() 896 { 897 const save = re_flags; 898 re_flags &= ~RegexOption.freeform; // stop ignoring whitespace if we did 899 bool casefold = cast(bool)(re_flags & RegexOption.casefold); 900 g.charsetToIr(unicode.parseSet(this, casefold)); 901 re_flags = save; 902 // Last next() in parseCharset is executed w/o freeform flag 903 if (re_flags & RegexOption.freeform) skipSpace(); 904 } 905 906 //parse and generate IR for escape stand alone escape sequence 907 @trusted void parseEscape() 908 {//accesses array of appender 909 import std.algorithm.iteration : sum; 910 switch (front) 911 { 912 case 'f': popFront(); g.put(Bytecode(IR.Char, '\f')); break; 913 case 'n': popFront(); g.put(Bytecode(IR.Char, '\n')); break; 914 case 'r': popFront(); g.put(Bytecode(IR.Char, '\r')); break; 915 case 't': popFront(); g.put(Bytecode(IR.Char, '\t')); break; 916 case 'v': popFront(); g.put(Bytecode(IR.Char, '\v')); break; 917 918 case 'd': 919 popFront(); 920 g.charsetToIr(unicode.Nd); 921 break; 922 case 'D': 923 popFront(); 924 g.charsetToIr(unicode.Nd.inverted); 925 break; 926 case 'b': popFront(); g.put(Bytecode(IR.Wordboundary, 0)); break; 927 case 'B': popFront(); g.put(Bytecode(IR.Notwordboundary, 0)); break; 928 case 's': 929 popFront(); 930 g.charsetToIr(unicode.White_Space); 931 break; 932 case 'S': 933 popFront(); 934 g.charsetToIr(unicode.White_Space.inverted); 935 break; 936 case 'w': 937 popFront(); 938 g.charsetToIr(wordCharacter); 939 break; 940 case 'W': 941 popFront(); 942 g.charsetToIr(wordCharacter.inverted); 943 break; 944 case 'p': case 'P': 945 bool casefold = cast(bool)(re_flags & RegexOption.casefold); 946 auto set = unicode.parsePropertySpec(this, front == 'P', casefold); 947 g.charsetToIr(set); 948 break; 949 case 'x': 950 immutable code = parseUniHex(pat, 2); 951 popFront(); 952 g.put(Bytecode(IR.Char,code)); 953 break; 954 case 'u': case 'U': 955 immutable code = parseUniHex(pat, front == 'u' ? 4 : 8); 956 popFront(); 957 g.put(Bytecode(IR.Char, code)); 958 break; 959 case 'c': //control codes 960 Bytecode code = Bytecode(IR.Char, unicode.parseControlCode(this)); 961 popFront(); 962 g.put(code); 963 break; 964 case '0': 965 popFront(); 966 g.put(Bytecode(IR.Char, 0));//NUL character 967 break; 968 case '1': .. case '9': 969 uint nref = cast(uint) front - '0'; 970 immutable maxBackref = sum(g.groupStack.data); 971 enforce(nref < maxBackref, "Backref to unseen group"); 972 //perl's disambiguation rule i.e. 973 //get next digit only if there is such group number 974 popFront(); 975 while (nref < maxBackref && !empty && std.ascii.isDigit(front)) 976 { 977 nref = nref * 10 + front - '0'; 978 popFront(); 979 } 980 if (nref >= maxBackref) 981 nref /= 10; 982 enforce(!g.isOpenGroup(nref), "Backref to open group"); 983 uint localLimit = maxBackref - g.groupStack.top; 984 if (nref >= localLimit) 985 { 986 g.put(Bytecode(IR.Backref, nref-localLimit)); 987 g.ir[$-1].setLocalRef(); 988 } 989 else 990 g.put(Bytecode(IR.Backref, nref)); 991 g.markBackref(nref); 992 break; 993 default: 994 if (front == '\\' && !pat.empty) 995 { 996 if (pat.front >= privateUseStart && pat.front <= privateUseEnd) 997 enforce(false, "invalid escape sequence"); 998 } 999 if (front >= privateUseStart && front <= privateUseEnd) 1000 { 1001 g.endPattern(front - privateUseStart + 1); 1002 break; 1003 } 1004 auto op = Bytecode(IR.Char, front); 1005 popFront(); 1006 g.put(op); 1007 } 1008 } 1009 1010 // 1011 @trusted void error(string msg) 1012 { 1013 import std.array : appender; 1014 import std.format.write : formattedWrite; 1015 auto app = appender!string(); 1016 formattedWrite(app, "%s\nPattern with error: `%s` <--HERE-- `%s`", 1017 msg, origin[0..$-pat.length], pat); 1018 throw new RegexException(app.data); 1019 } 1020 1021 alias Char = BasicElementOf!R; 1022 1023 @property program() 1024 { 1025 return makeRegex(this); 1026 } 1027 } 1028 1029 /+ 1030 Postproces the IR, then optimize. 1031 +/ 1032 @trusted void postprocess(Char)(ref Regex!Char zis) 1033 {//@@@BUG@@@ write is @system 1034 with(zis) 1035 { 1036 struct FixedStack(T) 1037 { 1038 T[] arr; 1039 uint _top; 1040 //this(T[] storage){ arr = storage; _top = -1; } 1041 @property ref T top(){ assert(!empty); return arr[_top]; } 1042 void push(T x){ arr[++_top] = x; } 1043 T pop() { assert(!empty); return arr[_top--]; } 1044 @property bool empty(){ return _top == -1; } 1045 } 1046 auto counterRange = FixedStack!uint(new uint[maxCounterDepth+1], -1); 1047 counterRange.push(1); 1048 ulong cumRange = 0; 1049 for (uint i = 0; i < ir.length; i += ir[i].length) 1050 { 1051 if (ir[i].hotspot) 1052 { 1053 assert(i + 1 < ir.length, 1054 "unexpected end of IR while looking for hotspot"); 1055 ir[i+1] = Bytecode.fromRaw(hotspotTableSize); 1056 hotspotTableSize += counterRange.top; 1057 } 1058 switch (ir[i].code) 1059 { 1060 case IR.RepeatStart, IR.RepeatQStart: 1061 uint repEnd = cast(uint)(i + ir[i].data + IRL!(IR.RepeatStart)); 1062 assert(ir[repEnd].code == ir[i].paired.code); 1063 immutable max = ir[repEnd + 4].raw; 1064 ir[repEnd+2].raw = counterRange.top; 1065 ir[repEnd+3].raw *= counterRange.top; 1066 ir[repEnd+4].raw *= counterRange.top; 1067 ulong cntRange = cast(ulong)(max)*counterRange.top; 1068 cumRange += cntRange; 1069 enforce(cumRange < maxCumulativeRepetitionLength, 1070 "repetition length limit is exceeded"); 1071 counterRange.push(cast(uint) cntRange + counterRange.top); 1072 threadCount += counterRange.top; 1073 break; 1074 case IR.RepeatEnd, IR.RepeatQEnd: 1075 threadCount += counterRange.top; 1076 counterRange.pop(); 1077 break; 1078 case IR.GroupStart: 1079 if (isBackref(ir[i].data)) 1080 ir[i].setBackrefence(); 1081 threadCount += counterRange.top; 1082 break; 1083 case IR.GroupEnd: 1084 if (isBackref(ir[i].data)) 1085 ir[i].setBackrefence(); 1086 threadCount += counterRange.top; 1087 break; 1088 default: 1089 threadCount += counterRange.top; 1090 } 1091 } 1092 checkIfOneShot(); 1093 if (!(flags & RegexInfo.oneShot)) 1094 kickstart = Kickstart!Char(zis, new uint[](256)); 1095 debug(std_regex_allocation) writefln("IR processed, max threads: %d", threadCount); 1096 optimize(zis); 1097 } 1098 } 1099 1100 void fixupBytecode()(Bytecode[] ir) 1101 { 1102 Stack!uint fixups; 1103 1104 with(IR) for (uint i=0; i<ir.length; i+= ir[i].length) 1105 { 1106 if (ir[i].isStart || ir[i].code == Option) 1107 fixups.push(i); 1108 else if (ir[i].code == OrEnd) 1109 { 1110 // Alternatives need more care 1111 auto j = fixups.pop(); // last Option 1112 ir[j].data = i - j - ir[j].length; 1113 j = fixups.pop(); // OrStart 1114 ir[j].data = i - j - ir[j].length; 1115 ir[i].data = ir[j].data; 1116 1117 // fixup all GotoEndOrs 1118 j = j + IRL!(OrStart); 1119 assert(ir[j].code == Option); 1120 for (;;) 1121 { 1122 auto next = j + ir[j].data + IRL!(Option); 1123 if (ir[next].code == IR.OrEnd) 1124 break; 1125 ir[next - IRL!(GotoEndOr)].data = i - next; 1126 j = next; 1127 } 1128 } 1129 else if (ir[i].code == GotoEndOr) 1130 { 1131 auto j = fixups.pop(); // Option 1132 ir[j].data = i - j + IRL!(GotoEndOr)- IRL!(Option); // to the next option 1133 } 1134 else if (ir[i].isEnd) 1135 { 1136 auto j = fixups.pop(); 1137 ir[i].data = i - j - ir[j].length; 1138 ir[j].data = ir[i].data; 1139 } 1140 } 1141 assert(fixups.empty); 1142 } 1143 1144 void optimize(Char)(ref Regex!Char zis) 1145 { 1146 import std.array : insertInPlace; 1147 CodepointSet nextSet(uint idx) 1148 { 1149 CodepointSet set; 1150 with(zis) with(IR) 1151 Outer: 1152 for (uint i = idx; i < ir.length; i += ir[i].length) 1153 { 1154 switch (ir[i].code) 1155 { 1156 case Char: 1157 set.add(ir[i].data, ir[i].data+1); 1158 goto default; 1159 //TODO: OrChar 1160 case Trie, CodepointSet: 1161 set = zis.charsets[ir[i].data]; 1162 goto default; 1163 case GroupStart,GroupEnd: 1164 break; 1165 default: 1166 break Outer; 1167 } 1168 } 1169 return set; 1170 } 1171 1172 with(zis) with(IR) for (uint i = 0; i < ir.length; i += ir[i].length) 1173 { 1174 if (ir[i].code == InfiniteEnd) 1175 { 1176 auto set = nextSet(i+IRL!(InfiniteEnd)); 1177 if (!set.empty && set.length < 10_000) 1178 { 1179 ir[i] = Bytecode(InfiniteBloomEnd, ir[i].data); 1180 ir[i - ir[i].data - IRL!(InfiniteStart)] = 1181 Bytecode(InfiniteBloomStart, ir[i].data); 1182 ir.insertInPlace(i+IRL!(InfiniteEnd), 1183 Bytecode.fromRaw(cast(uint) zis.filters.length)); 1184 zis.filters ~= BitTable(set); 1185 fixupBytecode(ir); 1186 } 1187 } 1188 } 1189 } 1190 1191 //IR code validator - proper nesting, illegal instructions, etc. 1192 @trusted void validateRe(Char)(ref Regex!Char zis) 1193 {//@@@BUG@@@ text is @system 1194 import std.conv : text; 1195 with(zis) 1196 { 1197 for (uint pc = 0; pc < ir.length; pc += ir[pc].length) 1198 { 1199 if (ir[pc].isStart || ir[pc].isEnd) 1200 { 1201 immutable dest = ir[pc].indexOfPair(pc); 1202 assert(dest < ir.length, text("Wrong length in opcode at pc=", 1203 pc, " ", dest, " vs ", ir.length)); 1204 assert(ir[dest].paired == ir[pc], 1205 text("Wrong pairing of opcodes at pc=", pc, "and pc=", dest)); 1206 } 1207 else if (ir[pc].isAtom) 1208 { 1209 1210 } 1211 else 1212 assert(0, text("Unknown type of instruction at pc=", pc)); 1213 } 1214 } 1215 } 1216