1 //Written in the D programming language 2 /* 3 Regular expression pattern parser. 4 */ 5 module std.regex.internal.parser; 6 7 static import std.ascii; 8 import std.range.primitives, std.uni, std.meta, 9 std.traits, std.typecons, std.exception; 10 import std.regex.internal.ir; 11 12 // package relevant info from parser into a regex object 13 auto makeRegex(S, CG)(Parser!(S, CG) p) 14 { 15 Regex!(BasicElementOf!S) re; 16 auto g = p.g; 17 with(re) 18 { 19 ir = g.ir; 20 dict = g.dict; 21 ngroup = g.ngroup; 22 maxCounterDepth = g.counterDepth; 23 flags = p.re_flags; 24 charsets = g.charsets; 25 matchers = g.matchers; 26 backrefed = g.backrefed; 27 re.postprocess(); 28 debug(std_regex_parser) 29 { 30 __ctfe || print(); 31 } 32 //@@@BUG@@@ (not reduced) 33 //somehow just using validate _collides_ with std.utf.validate (!) 34 version (assert) re.validateRe(); 35 } 36 return re; 37 } 38 39 // helper for unittest 40 auto makeRegex(S)(S arg) 41 if (isSomeString!S) 42 { 43 return makeRegex(Parser!(S, CodeGen)(arg, "")); 44 } 45 46 @system unittest 47 { 48 import std.algorithm.comparison : equal; 49 auto re = makeRegex(`(?P<name>\w+) = (?P<var>\d+)`); 50 auto nc = re.namedCaptures; 51 static assert(isRandomAccessRange!(typeof(nc))); 52 assert(!nc.empty); 53 assert(nc.length == 2); 54 assert(nc.equal(["name", "var"])); 55 assert(nc[0] == "name"); 56 assert(nc[1..$].equal(["var"])); 57 58 re = makeRegex(`(\w+) (?P<named>\w+) (\w+)`); 59 nc = re.namedCaptures; 60 assert(nc.length == 1); 61 assert(nc[0] == "named"); 62 assert(nc.front == "named"); 63 assert(nc.back == "named"); 64 65 re = makeRegex(`(\w+) (\w+)`); 66 nc = re.namedCaptures; 67 assert(nc.empty); 68 69 re = makeRegex(`(?P<year>\d{4})/(?P<month>\d{2})/(?P<day>\d{2})/`); 70 nc = re.namedCaptures; 71 auto cp = nc.save; 72 assert(nc.equal(cp)); 73 nc.popFront(); 74 assert(nc.equal(cp[1..$])); 75 nc.popBack(); 76 assert(nc.equal(cp[1 .. $ - 1])); 77 } 78 79 80 @trusted void reverseBytecode()(Bytecode[] code) 81 { 82 Bytecode[] rev = new Bytecode[code.length]; 83 uint revPc = cast(uint) rev.length; 84 Stack!(Tuple!(uint, uint, uint)) stack; 85 uint start = 0; 86 uint end = cast(uint) code.length; 87 for (;;) 88 { 89 for (uint pc = start; pc < end; ) 90 { 91 immutable len = code[pc].length; 92 if (code[pc].code == IR.GotoEndOr) 93 break; //pick next alternation branch 94 if (code[pc].isAtom) 95 { 96 rev[revPc - len .. revPc] = code[pc .. pc + len]; 97 revPc -= len; 98 pc += len; 99 } 100 else if (code[pc].isStart || code[pc].isEnd) 101 { 102 //skip over other embedded lookbehinds they are reversed 103 if (code[pc].code == IR.LookbehindStart 104 || code[pc].code == IR.NeglookbehindStart) 105 { 106 immutable blockLen = len + code[pc].data 107 + code[pc].pairedLength; 108 rev[revPc - blockLen .. revPc] = code[pc .. pc + blockLen]; 109 pc += blockLen; 110 revPc -= blockLen; 111 continue; 112 } 113 immutable second = code[pc].indexOfPair(pc); 114 immutable secLen = code[second].length; 115 rev[revPc - secLen .. revPc] = code[second .. second + secLen]; 116 revPc -= secLen; 117 if (code[pc].code == IR.OrStart) 118 { 119 //we pass len bytes forward, but secLen in reverse 120 immutable revStart = revPc - (second + len - secLen - pc); 121 uint r = revStart; 122 uint i = pc + IRL!(IR.OrStart); 123 while (code[i].code == IR.Option) 124 { 125 if (code[i - 1].code != IR.OrStart) 126 { 127 assert(code[i - 1].code == IR.GotoEndOr); 128 rev[r - 1] = code[i - 1]; 129 } 130 rev[r] = code[i]; 131 auto newStart = i + IRL!(IR.Option); 132 auto newEnd = newStart + code[i].data; 133 auto newRpc = r + code[i].data + IRL!(IR.Option); 134 if (code[newEnd].code != IR.OrEnd) 135 { 136 newRpc--; 137 } 138 stack.push(tuple(newStart, newEnd, newRpc)); 139 r += code[i].data + IRL!(IR.Option); 140 i += code[i].data + IRL!(IR.Option); 141 } 142 pc = i; 143 revPc = revStart; 144 assert(code[pc].code == IR.OrEnd); 145 } 146 else 147 pc += len; 148 } 149 } 150 if (stack.empty) 151 break; 152 start = stack.top[0]; 153 end = stack.top[1]; 154 revPc = stack.top[2]; 155 stack.pop(); 156 } 157 code[] = rev[]; 158 } 159 160 //test if a given string starts with hex number of maxDigit that's a valid codepoint 161 //returns it's value and skips these maxDigit chars on success, throws on failure 162 dchar parseUniHex(Char)(ref Char[] str, size_t maxDigit) 163 { 164 //std.conv.parse is both @system and bogus 165 enforce(str.length >= maxDigit,"incomplete escape sequence"); 166 uint val; 167 for (int k = 0; k < maxDigit; k++) 168 { 169 immutable current = str[k];//accepts ascii only, so it's OK to index directly 170 if ('0' <= current && current <= '9') 171 val = val * 16 + current - '0'; 172 else if ('a' <= current && current <= 'f') 173 val = val * 16 + current -'a' + 10; 174 else if ('A' <= current && current <= 'F') 175 val = val * 16 + current - 'A' + 10; 176 else 177 throw new Exception("invalid escape sequence"); 178 } 179 enforce(val <= 0x10FFFF, "invalid codepoint"); 180 str = str[maxDigit..$]; 181 return val; 182 } 183 184 @system unittest //BUG canFind is system 185 { 186 import std.algorithm.searching : canFind; 187 string[] non_hex = [ "000j", "000z", "FffG", "0Z"]; 188 string[] hex = [ "01", "ff", "00af", "10FFFF" ]; 189 int[] value = [ 1, 0xFF, 0xAF, 0x10FFFF ]; 190 foreach (v; non_hex) 191 assert(collectException(parseUniHex(v, v.length)).msg 192 .canFind("invalid escape sequence")); 193 foreach (i, v; hex) 194 assert(parseUniHex(v, v.length) == value[i]); 195 string over = "0011FFFF"; 196 assert(collectException(parseUniHex(over, over.length)).msg 197 .canFind("invalid codepoint")); 198 } 199 200 auto caseEnclose(CodepointSet set) 201 { 202 auto cased = set & unicode.LC; 203 foreach (dchar ch; cased.byCodepoint) 204 { 205 foreach (c; simpleCaseFoldings(ch)) 206 set |= c; 207 } 208 return set; 209 } 210 211 /+ 212 fetch codepoint set corresponding to a name (InBlock or binary property) 213 +/ 214 @trusted CodepointSet getUnicodeSet(in char[] name, bool negated, bool casefold) 215 { 216 CodepointSet s = unicode(name); 217 //FIXME: caseEnclose for new uni as Set | CaseEnclose(SET && LC) 218 if (casefold) 219 s = caseEnclose(s); 220 if (negated) 221 s = s.inverted; 222 return s; 223 } 224 225 //basic stack, just in case it gets used anywhere else then Parser 226 @trusted struct Stack(T) 227 { 228 T[] data; 229 @property bool empty(){ return data.empty; } 230 231 @property size_t length(){ return data.length; } 232 233 void push(T val){ data ~= val; } 234 235 T pop() 236 { 237 assert(!empty); 238 auto val = data[$ - 1]; 239 data = data[0 .. $ - 1]; 240 if (!__ctfe) 241 cast(void) data.assumeSafeAppend(); 242 return val; 243 } 244 245 @property ref T top() 246 { 247 assert(!empty); 248 return data[$ - 1]; 249 } 250 } 251 252 struct CodeGen 253 { 254 Bytecode[] ir; // resulting bytecode 255 Stack!(uint) fixupStack; // stack of opened start instructions 256 NamedGroup[] dict; // maps name -> user group number 257 Stack!(uint) groupStack; // stack of current number of group 258 uint nesting = 0; // group nesting level and repetitions step 259 uint lookaroundNest = 0; // nesting of lookaround 260 uint counterDepth = 0; // current depth of nested counted repetitions 261 CodepointSet[] charsets; // sets for char classes 262 const(CharMatcher)[] matchers; // matchers for char classes 263 uint[] backrefed; // bitarray for groups refered by backref 264 uint ngroup; // final number of groups (of all patterns) 265 266 void start(uint length) 267 { 268 if (!__ctfe) 269 ir.reserve((length*5+2)/4); 270 fixupStack.push(0); 271 groupStack.push(1);//0 - whole match 272 } 273 274 //mark referenced groups for latter processing 275 void markBackref(uint n) 276 { 277 if (n/32 >= backrefed.length) 278 backrefed.length = n/32 + 1; 279 backrefed[n / 32] |= 1 << (n & 31); 280 } 281 282 bool isOpenGroup(uint n) 283 { 284 import std.algorithm.searching : canFind; 285 // walk the fixup stack and see if there are groups labeled 'n' 286 // fixup '0' is reserved for alternations 287 return fixupStack.data[1..$]. 288 canFind!(fix => ir[fix].code == IR.GroupStart && ir[fix].data == n)(); 289 } 290 291 void put(Bytecode code) 292 { 293 enforce(ir.length < maxCompiledLength, 294 "maximum compiled pattern length is exceeded"); 295 ir ~= code; 296 } 297 298 void putRaw(uint number) 299 { 300 enforce(ir.length < maxCompiledLength, 301 "maximum compiled pattern length is exceeded"); 302 ir ~= Bytecode.fromRaw(number); 303 } 304 305 //try to generate optimal IR code for this CodepointSet 306 @trusted void charsetToIr(CodepointSet set) 307 {//@@@BUG@@@ writeln is @system 308 uint chars = cast(uint) set.length; 309 if (chars < Bytecode.maxSequence) 310 { 311 switch (chars) 312 { 313 case 1: 314 put(Bytecode(IR.Char, set.byCodepoint.front)); 315 break; 316 case 0: 317 throw new RegexException("empty CodepointSet not allowed"); 318 default: 319 foreach (ch; set.byCodepoint) 320 put(Bytecode(IR.OrChar, ch, chars)); 321 } 322 } 323 else 324 { 325 import std.algorithm.searching : countUntil; 326 const ivals = set.byInterval; 327 immutable n = charsets.countUntil(set); 328 if (n >= 0) 329 { 330 if (ivals.length*2 > maxCharsetUsed) 331 put(Bytecode(IR.Trie, cast(uint) n)); 332 else 333 put(Bytecode(IR.CodepointSet, cast(uint) n)); 334 return; 335 } 336 if (ivals.length*2 > maxCharsetUsed) 337 { 338 auto t = getMatcher(set); 339 put(Bytecode(IR.Trie, cast(uint) matchers.length)); 340 matchers ~= t; 341 debug(std_regex_allocation) writeln("Trie generated"); 342 } 343 else 344 { 345 put(Bytecode(IR.CodepointSet, cast(uint) charsets.length)); 346 matchers ~= CharMatcher.init; 347 } 348 charsets ~= set; 349 assert(charsets.length == matchers.length); 350 } 351 } 352 353 void genLogicGroup() 354 { 355 nesting++; 356 pushFixup(length); 357 put(Bytecode(IR.Nop, 0)); 358 } 359 360 void genGroup() 361 { 362 nesting++; 363 pushFixup(length); 364 immutable nglob = groupStack.top++; 365 enforce(groupStack.top <= maxGroupNumber, "limit on number of submatches is exceeded"); 366 put(Bytecode(IR.GroupStart, nglob)); 367 } 368 369 void genNamedGroup(string name) 370 { 371 import std.array : insertInPlace; 372 import std.range : assumeSorted; 373 nesting++; 374 pushFixup(length); 375 immutable nglob = groupStack.top++; 376 enforce(groupStack.top <= maxGroupNumber, "limit on submatches is exceeded"); 377 auto t = NamedGroup(name, nglob); 378 auto d = assumeSorted!"a.name < b.name"(dict); 379 immutable ind = d.lowerBound(t).length; 380 insertInPlace(dict, ind, t); 381 put(Bytecode(IR.GroupStart, nglob)); 382 } 383 384 //generate code for start of lookaround: (?= (?! (?<= (?<! 385 void genLookaround(IR opcode) 386 { 387 nesting++; 388 pushFixup(length); 389 put(Bytecode(opcode, 0)); 390 put(Bytecode.fromRaw(0)); 391 put(Bytecode.fromRaw(0)); 392 groupStack.push(0); 393 lookaroundNest++; 394 enforce(lookaroundNest <= maxLookaroundDepth, 395 "maximum lookaround depth is exceeded"); 396 } 397 398 void endPattern(uint num) 399 { 400 import std.algorithm.comparison : max; 401 put(Bytecode(IR.End, num)); 402 ngroup = max(ngroup, groupStack.top); 403 groupStack.top = 1; // reset group counter 404 } 405 406 //fixup lookaround with start at offset fix and append a proper *-End opcode 407 void fixLookaround(uint fix) 408 { 409 lookaroundNest--; 410 ir[fix] = Bytecode(ir[fix].code, 411 cast(uint) ir.length - fix - IRL!(IR.LookaheadStart)); 412 auto g = groupStack.pop(); 413 assert(!groupStack.empty); 414 ir[fix+1] = Bytecode.fromRaw(groupStack.top); 415 //groups are cumulative across lookarounds 416 ir[fix+2] = Bytecode.fromRaw(groupStack.top+g); 417 groupStack.top += g; 418 if (ir[fix].code == IR.LookbehindStart || ir[fix].code == IR.NeglookbehindStart) 419 { 420 reverseBytecode(ir[fix + IRL!(IR.LookbehindStart) .. $]); 421 } 422 put(ir[fix].paired); 423 } 424 425 // repetition of {1,1} 426 void fixRepetition(uint offset) 427 { 428 import std.algorithm.mutation : copy; 429 immutable replace = ir[offset].code == IR.Nop; 430 if (replace) 431 { 432 copy(ir[offset + 1 .. $], ir[offset .. $ - 1]); 433 ir.length -= 1; 434 } 435 } 436 437 // repetition of {x,y} 438 void fixRepetition(uint offset, uint min, uint max, bool greedy) 439 { 440 static import std.algorithm.comparison; 441 import std.algorithm.mutation : copy; 442 import std.array : insertInPlace; 443 immutable replace = ir[offset].code == IR.Nop; 444 immutable len = cast(uint) ir.length - offset - replace; 445 if (max != infinite) 446 { 447 if (min != 1 || max != 1) 448 { 449 Bytecode op = Bytecode(greedy ? IR.RepeatStart : IR.RepeatQStart, len); 450 if (replace) 451 ir[offset] = op; 452 else 453 insertInPlace(ir, offset, op); 454 put(Bytecode(greedy ? IR.RepeatEnd : IR.RepeatQEnd, len)); 455 put(Bytecode.init); //hotspot 456 putRaw(1); 457 putRaw(min); 458 putRaw(max); 459 counterDepth = std.algorithm.comparison.max(counterDepth, nesting+1); 460 } 461 } 462 else if (min) //&& max is infinite 463 { 464 if (min != 1) 465 { 466 Bytecode op = Bytecode(greedy ? IR.RepeatStart : IR.RepeatQStart, len); 467 if (replace) 468 ir[offset] = op; 469 else 470 insertInPlace(ir, offset, op); 471 offset += 1;//so it still points to the repeated block 472 put(Bytecode(greedy ? IR.RepeatEnd : IR.RepeatQEnd, len)); 473 put(Bytecode.init); //hotspot 474 putRaw(1); 475 putRaw(min); 476 putRaw(min); 477 counterDepth = std.algorithm.comparison.max(counterDepth, nesting+1); 478 } 479 else if (replace) 480 { 481 copy(ir[offset+1 .. $], ir[offset .. $-1]); 482 ir.length -= 1; 483 } 484 put(Bytecode(greedy ? IR.InfiniteStart : IR.InfiniteQStart, len)); 485 enforce(ir.length + len < maxCompiledLength, "maximum compiled pattern length is exceeded"); 486 ir ~= ir[offset .. offset+len]; 487 //IR.InfinteX is always a hotspot 488 put(Bytecode(greedy ? IR.InfiniteEnd : IR.InfiniteQEnd, len)); 489 put(Bytecode.init); //merge index 490 } 491 else//vanila {0,inf} 492 { 493 Bytecode op = Bytecode(greedy ? IR.InfiniteStart : IR.InfiniteQStart, len); 494 if (replace) 495 ir[offset] = op; 496 else 497 insertInPlace(ir, offset, op); 498 //IR.InfinteX is always a hotspot 499 put(Bytecode(greedy ? IR.InfiniteEnd : IR.InfiniteQEnd, len)); 500 put(Bytecode.init); //merge index 501 } 502 } 503 504 void fixAlternation() 505 { 506 import std.array : insertInPlace; 507 uint fix = fixupStack.top; 508 if (ir.length > fix && ir[fix].code == IR.Option) 509 { 510 ir[fix] = Bytecode(ir[fix].code, cast(uint) ir.length - fix); 511 put(Bytecode(IR.GotoEndOr, 0)); 512 fixupStack.top = cast(uint) ir.length; //replace latest fixup for Option 513 put(Bytecode(IR.Option, 0)); 514 return; 515 } 516 uint len, orStart; 517 //start a new option 518 if (fixupStack.length == 1) 519 {//only root entry, effectively no fixup 520 len = cast(uint) ir.length + IRL!(IR.GotoEndOr); 521 orStart = 0; 522 } 523 else 524 {//IR.lookahead, etc. fixups that have length > 1, thus check ir[x].length 525 len = cast(uint) ir.length - fix - (ir[fix].length - 1); 526 orStart = fix + ir[fix].length; 527 } 528 insertInPlace(ir, orStart, Bytecode(IR.OrStart, 0), Bytecode(IR.Option, len)); 529 assert(ir[orStart].code == IR.OrStart); 530 put(Bytecode(IR.GotoEndOr, 0)); 531 fixupStack.push(orStart); //fixup for StartOR 532 fixupStack.push(cast(uint) ir.length); //for second Option 533 put(Bytecode(IR.Option, 0)); 534 } 535 536 // finalizes IR.Option, fix points to the first option of sequence 537 void finishAlternation(uint fix) 538 { 539 enforce(ir[fix].code == IR.Option, "no matching ')'"); 540 ir[fix] = Bytecode(ir[fix].code, cast(uint) ir.length - fix - IRL!(IR.OrStart)); 541 fix = fixupStack.pop(); 542 enforce(ir[fix].code == IR.OrStart, "no matching ')'"); 543 ir[fix] = Bytecode(IR.OrStart, cast(uint) ir.length - fix - IRL!(IR.OrStart)); 544 put(Bytecode(IR.OrEnd, cast(uint) ir.length - fix - IRL!(IR.OrStart))); 545 uint pc = fix + IRL!(IR.OrStart); 546 while (ir[pc].code == IR.Option) 547 { 548 pc = pc + ir[pc].data; 549 if (ir[pc].code != IR.GotoEndOr) 550 break; 551 ir[pc] = Bytecode(IR.GotoEndOr, cast(uint)(ir.length - pc - IRL!(IR.OrEnd))); 552 pc += IRL!(IR.GotoEndOr); 553 } 554 put(Bytecode.fromRaw(0)); 555 } 556 557 // returns: (flag - repetition possible?, fixup of the start of this "group") 558 Tuple!(bool, uint) onClose() 559 { 560 nesting--; 561 uint fix = popFixup(); 562 switch (ir[fix].code) 563 { 564 case IR.GroupStart: 565 put(Bytecode(IR.GroupEnd, ir[fix].data)); 566 return tuple(true, fix); 567 case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart: 568 assert(lookaroundNest); 569 fixLookaround(fix); 570 return tuple(false, 0u); 571 case IR.Option: //| xxx ) 572 //two fixups: last option + full OR 573 finishAlternation(fix); 574 fix = topFixup; 575 switch (ir[fix].code) 576 { 577 case IR.GroupStart: 578 popFixup(); 579 put(Bytecode(IR.GroupEnd, ir[fix].data)); 580 return tuple(true, fix); 581 case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart: 582 assert(lookaroundNest); 583 fix = popFixup(); 584 fixLookaround(fix); 585 return tuple(false, 0u); 586 default://(?:xxx) 587 popFixup(); 588 return tuple(true, fix); 589 } 590 default://(?:xxx) 591 return tuple(true, fix); 592 } 593 } 594 595 uint popFixup(){ return fixupStack.pop(); } 596 597 void pushFixup(uint val){ return fixupStack.push(val); } 598 599 @property uint topFixup(){ return fixupStack.top; } 600 601 @property size_t fixupLength(){ return fixupStack.data.length; } 602 603 @property uint length(){ return cast(uint) ir.length; } 604 } 605 606 // safety limits 607 enum maxGroupNumber = 2^^19; 608 enum maxLookaroundDepth = 16; 609 // *Bytecode.sizeof, i.e. 1Mb of bytecode alone 610 enum maxCompiledLength = 2^^18; 611 // amounts to up to 4 Mb of auxilary table for matching 612 enum maxCumulativeRepetitionLength = 2^^20; 613 // marker to indicate infinite repetition 614 enum infinite = ~0u; 615 616 struct Parser(R, Generator) 617 if (isForwardRange!R && is(ElementType!R : dchar)) 618 { 619 dchar _current; 620 bool empty; 621 R pat, origin; //keep full pattern for pretty printing error messages 622 uint re_flags = 0; //global flags e.g. multiline + internal ones 623 Generator g; 624 625 @trusted this(S)(R pattern, S flags) 626 if (isSomeString!S) 627 { 628 pat = origin = pattern; 629 //reserve slightly more then avg as sampled from unittests 630 parseFlags(flags); 631 _current = ' ';//a safe default for freeform parsing 632 next(); 633 g.start(cast(uint) pat.length); 634 try 635 { 636 parseRegex(); 637 } 638 catch (Exception e) 639 { 640 error(e.msg);//also adds pattern location 641 } 642 g.endPattern(1); 643 } 644 645 @property dchar current(){ return _current; } 646 647 bool _next() 648 { 649 if (pat.empty) 650 { 651 empty = true; 652 return false; 653 } 654 _current = pat.front; 655 pat.popFront(); 656 return true; 657 } 658 659 void skipSpace() 660 { 661 while (isWhite(current) && _next()){ } 662 } 663 664 bool next() 665 { 666 if (re_flags & RegexOption.freeform) 667 { 668 immutable r = _next(); 669 skipSpace(); 670 return r; 671 } 672 else 673 return _next(); 674 } 675 676 //parsing number with basic overflow check 677 uint parseDecimal() 678 { 679 uint r = 0; 680 while (std.ascii.isDigit(current)) 681 { 682 if (r >= (uint.max/10)) 683 error("Overflow in decimal number"); 684 r = 10*r + cast(uint)(current-'0'); 685 if (!next()) 686 break; 687 } 688 return r; 689 } 690 691 //parse control code of form \cXXX, c assumed to be the current symbol 692 dchar parseControlCode() 693 { 694 enforce(next(), "Unfinished escape sequence"); 695 enforce(('a' <= current && current <= 'z') || ('A' <= current && current <= 'Z'), 696 "Only letters are allowed after \\c"); 697 return current & 0x1f; 698 } 699 700 // 701 @trusted void parseFlags(S)(S flags) 702 {//@@@BUG@@@ text is @system 703 import std.conv : text; 704 foreach (ch; flags)//flags are ASCII anyway 705 { 706 L_FlagSwitch: 707 switch (ch) 708 { 709 710 foreach (i, op; __traits(allMembers, RegexOption)) 711 { 712 case RegexOptionNames[i]: 713 if (re_flags & mixin("RegexOption."~op)) 714 throw new RegexException(text("redundant flag specified: ",ch)); 715 re_flags |= mixin("RegexOption."~op); 716 break L_FlagSwitch; 717 } 718 default: 719 throw new RegexException(text("unknown regex flag '",ch,"'")); 720 } 721 } 722 } 723 724 //parse and store IR for regex pattern 725 @trusted void parseRegex() 726 { 727 uint fix;//fixup pointer 728 729 while (!empty) 730 { 731 debug(std_regex_parser) 732 __ctfe || writeln("*LR*\nSource: ", pat, "\nStack: ",fixupStack.data); 733 switch (current) 734 { 735 case '(': 736 next(); 737 if (current == '?') 738 { 739 next(); 740 switch (current) 741 { 742 case '#': 743 for (;;) 744 { 745 if (!next()) 746 error("Unexpected end of pattern"); 747 if (current == ')') 748 { 749 next(); 750 break; 751 } 752 } 753 break; 754 case ':': 755 g.genLogicGroup(); 756 next(); 757 break; 758 case '=': 759 g.genLookaround(IR.LookaheadStart); 760 next(); 761 break; 762 case '!': 763 g.genLookaround(IR.NeglookaheadStart); 764 next(); 765 break; 766 case 'P': 767 next(); 768 if (current != '<') 769 error("Expected '<' in named group"); 770 string name; 771 if (!next() || !(isAlpha(current) || current == '_')) 772 error("Expected alpha starting a named group"); 773 name ~= current; 774 while (next() && (isAlpha(current) || 775 current == '_' || std.ascii.isDigit(current))) 776 { 777 name ~= current; 778 } 779 if (current != '>') 780 error("Expected '>' closing named group"); 781 next(); 782 g.genNamedGroup(name); 783 break; 784 case '<': 785 next(); 786 if (current == '=') 787 g.genLookaround(IR.LookbehindStart); 788 else if (current == '!') 789 g.genLookaround(IR.NeglookbehindStart); 790 else 791 error("'!' or '=' expected after '<'"); 792 next(); 793 break; 794 default: 795 uint enableFlags, disableFlags; 796 bool enable = true; 797 do 798 { 799 switch (current) 800 { 801 case 's': 802 if (enable) 803 enableFlags |= RegexOption.singleline; 804 else 805 disableFlags |= RegexOption.singleline; 806 break; 807 case 'x': 808 if (enable) 809 enableFlags |= RegexOption.freeform; 810 else 811 disableFlags |= RegexOption.freeform; 812 break; 813 case 'i': 814 if (enable) 815 enableFlags |= RegexOption.casefold; 816 else 817 disableFlags |= RegexOption.casefold; 818 break; 819 case 'm': 820 if (enable) 821 enableFlags |= RegexOption.multiline; 822 else 823 disableFlags |= RegexOption.multiline; 824 break; 825 case '-': 826 if (!enable) 827 error(" unexpected second '-' in flags"); 828 enable = false; 829 break; 830 default: 831 error(" 's', 'x', 'i', 'm' or '-' expected after '(?' "); 832 } 833 next(); 834 }while (current != ')'); 835 next(); 836 re_flags |= enableFlags; 837 re_flags &= ~disableFlags; 838 } 839 } 840 else 841 { 842 g.genGroup(); 843 } 844 break; 845 case ')': 846 enforce(g.nesting, "Unmatched ')'"); 847 next(); 848 auto pair = g.onClose(); 849 if (pair[0]) 850 parseQuantifier(pair[1]); 851 break; 852 case '|': 853 next(); 854 g.fixAlternation(); 855 break; 856 default://no groups or whatever 857 immutable start = g.length; 858 parseAtom(); 859 parseQuantifier(start); 860 } 861 } 862 863 if (g.fixupLength != 1) 864 { 865 fix = g.popFixup(); 866 g.finishAlternation(fix); 867 enforce(g.fixupLength == 1, "no matching ')'"); 868 } 869 } 870 871 872 //parse and store IR for atom-quantifier pair 873 @trusted void parseQuantifier(uint offset) 874 {//copy is @system 875 if (empty) 876 return g.fixRepetition(offset); 877 uint min, max; 878 switch (current) 879 { 880 case '*': 881 min = 0; 882 max = infinite; 883 break; 884 case '?': 885 min = 0; 886 max = 1; 887 break; 888 case '+': 889 min = 1; 890 max = infinite; 891 break; 892 case '{': 893 enforce(next(), "Unexpected end of regex pattern"); 894 enforce(std.ascii.isDigit(current), "First number required in repetition"); 895 min = parseDecimal(); 896 if (current == '}') 897 max = min; 898 else if (current == ',') 899 { 900 next(); 901 if (std.ascii.isDigit(current)) 902 max = parseDecimal(); 903 else if (current == '}') 904 max = infinite; 905 else 906 error("Unexpected symbol in regex pattern"); 907 skipSpace(); 908 if (current != '}') 909 error("Unmatched '{' in regex pattern"); 910 } 911 else 912 error("Unexpected symbol in regex pattern"); 913 if (min > max) 914 error("Illegal {n,m} quantifier"); 915 break; 916 default: 917 g.fixRepetition(offset); 918 return; 919 } 920 bool greedy = true; 921 //check only if we managed to get new symbol 922 if (next() && current == '?') 923 { 924 greedy = false; 925 next(); 926 } 927 g.fixRepetition(offset, min, max, greedy); 928 } 929 930 //parse and store IR for atom 931 void parseAtom() 932 { 933 if (empty) 934 return; 935 switch (current) 936 { 937 case '*', '?', '+', '|', '{', '}': 938 error("'*', '+', '?', '{', '}' not allowed in atom"); 939 break; 940 case '.': 941 if (re_flags & RegexOption.singleline) 942 g.put(Bytecode(IR.Any, 0)); 943 else 944 { 945 CodepointSet set; 946 g.charsetToIr(set.add('\n','\n'+1).add('\r', '\r'+1).inverted); 947 } 948 next(); 949 break; 950 case '[': 951 parseCharset(); 952 break; 953 case '\\': 954 enforce(_next(), "Unfinished escape sequence"); 955 parseEscape(); 956 break; 957 case '^': 958 if (re_flags & RegexOption.multiline) 959 g.put(Bytecode(IR.Bol, 0)); 960 else 961 g.put(Bytecode(IR.Bof, 0)); 962 next(); 963 break; 964 case '$': 965 if (re_flags & RegexOption.multiline) 966 g.put(Bytecode(IR.Eol, 0)); 967 else 968 g.put(Bytecode(IR.Eof, 0)); 969 next(); 970 break; 971 default: 972 //FIXME: getCommonCasing in new std uni 973 if (re_flags & RegexOption.casefold) 974 { 975 auto range = simpleCaseFoldings(current); 976 assert(range.length <= 5); 977 if (range.length == 1) 978 g.put(Bytecode(IR.Char, range.front)); 979 else 980 foreach (v; range) 981 g.put(Bytecode(IR.OrChar, v, cast(uint) range.length)); 982 } 983 else 984 g.put(Bytecode(IR.Char, current)); 985 next(); 986 } 987 } 988 989 990 991 //CodepointSet operations relatively in order of priority 992 enum Operator:uint { 993 Open = 0, Negate, Difference, SymDifference, Intersection, Union, None 994 } 995 996 //parse unit of CodepointSet spec, most notably escape sequences and char ranges 997 //also fetches next set operation 998 Tuple!(CodepointSet,Operator) parseCharTerm() 999 { 1000 enum State{ Start, Char, Escape, CharDash, CharDashEscape, 1001 PotentialTwinSymbolOperator } 1002 Operator op = Operator.None; 1003 dchar last; 1004 CodepointSet set; 1005 State state = State.Start; 1006 1007 static void addWithFlags(ref CodepointSet set, uint ch, uint re_flags) 1008 { 1009 if (re_flags & RegexOption.casefold) 1010 { 1011 auto range = simpleCaseFoldings(ch); 1012 foreach (v; range) 1013 set |= v; 1014 } 1015 else 1016 set |= ch; 1017 } 1018 1019 static Operator twinSymbolOperator(dchar symbol) 1020 { 1021 switch (symbol) 1022 { 1023 case '|': 1024 return Operator.Union; 1025 case '-': 1026 return Operator.Difference; 1027 case '~': 1028 return Operator.SymDifference; 1029 case '&': 1030 return Operator.Intersection; 1031 default: 1032 assert(false); 1033 } 1034 } 1035 1036 L_CharTermLoop: 1037 for (;;) 1038 { 1039 final switch (state) 1040 { 1041 case State.Start: 1042 switch (current) 1043 { 1044 case '|': 1045 case '-': 1046 case '~': 1047 case '&': 1048 state = State.PotentialTwinSymbolOperator; 1049 last = current; 1050 break; 1051 case '[': 1052 op = Operator.Union; 1053 goto case; 1054 case ']': 1055 break L_CharTermLoop; 1056 case '\\': 1057 state = State.Escape; 1058 break; 1059 default: 1060 state = State.Char; 1061 last = current; 1062 } 1063 break; 1064 case State.Char: 1065 // xxx last current xxx 1066 switch (current) 1067 { 1068 case '|': 1069 case '~': 1070 case '&': 1071 // then last is treated as normal char and added as implicit union 1072 state = State.PotentialTwinSymbolOperator; 1073 addWithFlags(set, last, re_flags); 1074 last = current; 1075 break; 1076 case '-': // still need more info 1077 state = State.CharDash; 1078 break; 1079 case '\\': 1080 set |= last; 1081 state = State.Escape; 1082 break; 1083 case '[': 1084 op = Operator.Union; 1085 goto case; 1086 case ']': 1087 addWithFlags(set, last, re_flags); 1088 break L_CharTermLoop; 1089 default: 1090 state = State.Char; 1091 addWithFlags(set, last, re_flags); 1092 last = current; 1093 } 1094 break; 1095 case State.PotentialTwinSymbolOperator: 1096 // xxx last current xxxx 1097 // where last = [|-&~] 1098 if (current == last) 1099 { 1100 op = twinSymbolOperator(last); 1101 next();//skip second twin char 1102 break L_CharTermLoop; 1103 } 1104 goto case State.Char; 1105 case State.Escape: 1106 // xxx \ current xxx 1107 switch (current) 1108 { 1109 case 'f': 1110 last = '\f'; 1111 state = State.Char; 1112 break; 1113 case 'n': 1114 last = '\n'; 1115 state = State.Char; 1116 break; 1117 case 'r': 1118 last = '\r'; 1119 state = State.Char; 1120 break; 1121 case 't': 1122 last = '\t'; 1123 state = State.Char; 1124 break; 1125 case 'v': 1126 last = '\v'; 1127 state = State.Char; 1128 break; 1129 case 'c': 1130 last = parseControlCode(); 1131 state = State.Char; 1132 break; 1133 foreach (val; Escapables) 1134 { 1135 case val: 1136 } 1137 last = current; 1138 state = State.Char; 1139 break; 1140 case 'p': 1141 set.add(parseUnicodePropertySpec(false)); 1142 state = State.Start; 1143 continue L_CharTermLoop; //next char already fetched 1144 case 'P': 1145 set.add(parseUnicodePropertySpec(true)); 1146 state = State.Start; 1147 continue L_CharTermLoop; //next char already fetched 1148 case 'x': 1149 last = parseUniHex(pat, 2); 1150 state = State.Char; 1151 break; 1152 case 'u': 1153 last = parseUniHex(pat, 4); 1154 state = State.Char; 1155 break; 1156 case 'U': 1157 last = parseUniHex(pat, 8); 1158 state = State.Char; 1159 break; 1160 case 'd': 1161 set.add(unicode.Nd); 1162 state = State.Start; 1163 break; 1164 case 'D': 1165 set.add(unicode.Nd.inverted); 1166 state = State.Start; 1167 break; 1168 case 's': 1169 set.add(unicode.White_Space); 1170 state = State.Start; 1171 break; 1172 case 'S': 1173 set.add(unicode.White_Space.inverted); 1174 state = State.Start; 1175 break; 1176 case 'w': 1177 set.add(wordCharacter); 1178 state = State.Start; 1179 break; 1180 case 'W': 1181 set.add(wordCharacter.inverted); 1182 state = State.Start; 1183 break; 1184 default: 1185 if (current >= privateUseStart && current <= privateUseEnd) 1186 enforce(false, "no matching ']' found while parsing character class"); 1187 enforce(false, "invalid escape sequence"); 1188 } 1189 break; 1190 case State.CharDash: 1191 // xxx last - current xxx 1192 switch (current) 1193 { 1194 case '[': 1195 op = Operator.Union; 1196 goto case; 1197 case ']': 1198 //means dash is a single char not an interval specifier 1199 addWithFlags(set, last, re_flags); 1200 addWithFlags(set, '-', re_flags); 1201 break L_CharTermLoop; 1202 case '-'://set Difference again 1203 addWithFlags(set, last, re_flags); 1204 op = Operator.Difference; 1205 next();//skip '-' 1206 break L_CharTermLoop; 1207 case '\\': 1208 state = State.CharDashEscape; 1209 break; 1210 default: 1211 enforce(last <= current, "inverted range"); 1212 if (re_flags & RegexOption.casefold) 1213 { 1214 for (uint ch = last; ch <= current; ch++) 1215 addWithFlags(set, ch, re_flags); 1216 } 1217 else 1218 set.add(last, current + 1); 1219 state = State.Start; 1220 } 1221 break; 1222 case State.CharDashEscape: 1223 //xxx last - \ current xxx 1224 uint end; 1225 switch (current) 1226 { 1227 case 'f': 1228 end = '\f'; 1229 break; 1230 case 'n': 1231 end = '\n'; 1232 break; 1233 case 'r': 1234 end = '\r'; 1235 break; 1236 case 't': 1237 end = '\t'; 1238 break; 1239 case 'v': 1240 end = '\v'; 1241 break; 1242 foreach (val; Escapables) 1243 { 1244 case val: 1245 } 1246 end = current; 1247 break; 1248 case 'c': 1249 end = parseControlCode(); 1250 break; 1251 case 'x': 1252 end = parseUniHex(pat, 2); 1253 break; 1254 case 'u': 1255 end = parseUniHex(pat, 4); 1256 break; 1257 case 'U': 1258 end = parseUniHex(pat, 8); 1259 break; 1260 default: 1261 if (current >= privateUseStart && current <= privateUseEnd) 1262 enforce(false, "no matching ']' found while parsing character class"); 1263 error("invalid escape sequence"); 1264 } 1265 // Lookahead to check if it's a \T 1266 // where T is sub-pattern terminator in multi-pattern scheme 1267 if (end == '\\' && !pat.empty) 1268 { 1269 if (pat.front >= privateUseStart && pat.front <= privateUseEnd) 1270 enforce(false, "invalid escape sequence"); 1271 } 1272 enforce(last <= end,"inverted range"); 1273 set.add(last, end + 1); 1274 state = State.Start; 1275 break; 1276 } 1277 enforce(next(), "unexpected end of CodepointSet"); 1278 } 1279 return tuple(set, op); 1280 } 1281 1282 alias ValStack = Stack!(CodepointSet); 1283 alias OpStack = Stack!(Operator); 1284 1285 //parse and store IR for CodepointSet 1286 void parseCharset() 1287 { 1288 const save = re_flags; 1289 re_flags &= ~RegexOption.freeform; // stop ignoring whitespace if we did 1290 parseCharsetImpl(); 1291 re_flags = save; 1292 // Last next() in parseCharsetImp is executed w/o freeform flag 1293 if (re_flags & RegexOption.freeform) skipSpace(); 1294 } 1295 1296 void parseCharsetImpl() 1297 { 1298 ValStack vstack; 1299 OpStack opstack; 1300 import std.functional : unaryFun; 1301 // 1302 static bool apply(Operator op, ref ValStack stack) 1303 { 1304 switch (op) 1305 { 1306 case Operator.Negate: 1307 enforce(!stack.empty, "no operand for '^'"); 1308 stack.top = stack.top.inverted; 1309 break; 1310 case Operator.Union: 1311 auto s = stack.pop();//2nd operand 1312 enforce(!stack.empty, "no operand for '||'"); 1313 stack.top.add(s); 1314 break; 1315 case Operator.Difference: 1316 auto s = stack.pop();//2nd operand 1317 enforce(!stack.empty, "no operand for '--'"); 1318 stack.top.sub(s); 1319 break; 1320 case Operator.SymDifference: 1321 auto s = stack.pop();//2nd operand 1322 enforce(!stack.empty, "no operand for '~~'"); 1323 stack.top ~= s; 1324 break; 1325 case Operator.Intersection: 1326 auto s = stack.pop();//2nd operand 1327 enforce(!stack.empty, "no operand for '&&'"); 1328 stack.top.intersect(s); 1329 break; 1330 default: 1331 return false; 1332 } 1333 return true; 1334 } 1335 static bool unrollWhile(alias cond)(ref ValStack vstack, ref OpStack opstack) 1336 { 1337 while (cond(opstack.top)) 1338 { 1339 if (!apply(opstack.pop(),vstack)) 1340 return false;//syntax error 1341 if (opstack.empty) 1342 return false; 1343 } 1344 return true; 1345 } 1346 1347 L_CharsetLoop: 1348 do 1349 { 1350 switch (current) 1351 { 1352 case '[': 1353 opstack.push(Operator.Open); 1354 enforce(next(), "unexpected end of character class"); 1355 if (current == '^') 1356 { 1357 opstack.push(Operator.Negate); 1358 enforce(next(), "unexpected end of character class"); 1359 } 1360 else if (current == ']') // []...] is special cased 1361 { 1362 enforce(next(), "wrong character set"); 1363 auto pair = parseCharTerm(); 1364 pair[0].add(']', ']'+1); 1365 if (pair[1] != Operator.None) 1366 { 1367 if (opstack.top == Operator.Union) 1368 unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack); 1369 opstack.push(pair[1]); 1370 } 1371 vstack.push(pair[0]); 1372 } 1373 break; 1374 case ']': 1375 enforce(unrollWhile!(unaryFun!"a != a.Open")(vstack, opstack), 1376 "character class syntax error"); 1377 enforce(!opstack.empty, "unmatched ']'"); 1378 opstack.pop(); 1379 if (!next() || opstack.empty) 1380 break L_CharsetLoop; 1381 auto pair = parseCharTerm(); 1382 if (!pair[0].empty)//not only operator e.g. -- or ~~ 1383 { 1384 vstack.top.add(pair[0]);//apply union 1385 } 1386 if (pair[1] != Operator.None) 1387 { 1388 if (opstack.top == Operator.Union) 1389 unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack); 1390 opstack.push(pair[1]); 1391 } 1392 break; 1393 // 1394 default://yet another pair of term(op)? 1395 auto pair = parseCharTerm(); 1396 if (pair[1] != Operator.None) 1397 { 1398 if (opstack.top == Operator.Union) 1399 unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack); 1400 opstack.push(pair[1]); 1401 } 1402 vstack.push(pair[0]); 1403 } 1404 }while (!empty || !opstack.empty); 1405 while (!opstack.empty) 1406 { 1407 enforce(opstack.top != Operator.Open, 1408 "no matching ']' found while parsing character class"); 1409 apply(opstack.pop(), vstack); 1410 } 1411 assert(vstack.length == 1); 1412 g.charsetToIr(vstack.top); 1413 } 1414 1415 //parse and generate IR for escape stand alone escape sequence 1416 @trusted void parseEscape() 1417 {//accesses array of appender 1418 import std.algorithm.iteration : sum; 1419 switch (current) 1420 { 1421 case 'f': next(); g.put(Bytecode(IR.Char, '\f')); break; 1422 case 'n': next(); g.put(Bytecode(IR.Char, '\n')); break; 1423 case 'r': next(); g.put(Bytecode(IR.Char, '\r')); break; 1424 case 't': next(); g.put(Bytecode(IR.Char, '\t')); break; 1425 case 'v': next(); g.put(Bytecode(IR.Char, '\v')); break; 1426 1427 case 'd': 1428 next(); 1429 g.charsetToIr(unicode.Nd); 1430 break; 1431 case 'D': 1432 next(); 1433 g.charsetToIr(unicode.Nd.inverted); 1434 break; 1435 case 'b': next(); g.put(Bytecode(IR.Wordboundary, 0)); break; 1436 case 'B': next(); g.put(Bytecode(IR.Notwordboundary, 0)); break; 1437 case 's': 1438 next(); 1439 g.charsetToIr(unicode.White_Space); 1440 break; 1441 case 'S': 1442 next(); 1443 g.charsetToIr(unicode.White_Space.inverted); 1444 break; 1445 case 'w': 1446 next(); 1447 g.charsetToIr(wordCharacter); 1448 break; 1449 case 'W': 1450 next(); 1451 g.charsetToIr(wordCharacter.inverted); 1452 break; 1453 case 'p': case 'P': 1454 auto CodepointSet = parseUnicodePropertySpec(current == 'P'); 1455 g.charsetToIr(CodepointSet); 1456 break; 1457 case 'x': 1458 immutable code = parseUniHex(pat, 2); 1459 next(); 1460 g.put(Bytecode(IR.Char,code)); 1461 break; 1462 case 'u': case 'U': 1463 immutable code = parseUniHex(pat, current == 'u' ? 4 : 8); 1464 next(); 1465 g.put(Bytecode(IR.Char, code)); 1466 break; 1467 case 'c': //control codes 1468 Bytecode code = Bytecode(IR.Char, parseControlCode()); 1469 next(); 1470 g.put(code); 1471 break; 1472 case '0': 1473 next(); 1474 g.put(Bytecode(IR.Char, 0));//NUL character 1475 break; 1476 case '1': .. case '9': 1477 uint nref = cast(uint) current - '0'; 1478 immutable maxBackref = sum(g.groupStack.data); 1479 enforce(nref < maxBackref, "Backref to unseen group"); 1480 //perl's disambiguation rule i.e. 1481 //get next digit only if there is such group number 1482 while (nref < maxBackref && next() && std.ascii.isDigit(current)) 1483 { 1484 nref = nref * 10 + current - '0'; 1485 } 1486 if (nref >= maxBackref) 1487 nref /= 10; 1488 enforce(!g.isOpenGroup(nref), "Backref to open group"); 1489 uint localLimit = maxBackref - g.groupStack.top; 1490 if (nref >= localLimit) 1491 { 1492 g.put(Bytecode(IR.Backref, nref-localLimit)); 1493 g.ir[$-1].setLocalRef(); 1494 } 1495 else 1496 g.put(Bytecode(IR.Backref, nref)); 1497 g.markBackref(nref); 1498 break; 1499 default: 1500 // Lookahead to check if it's a \T 1501 // where T is sub-pattern terminator in multi-pattern scheme 1502 if (current == '\\' && !pat.empty) 1503 { 1504 if (pat.front >= privateUseStart && current <= privateUseEnd) 1505 enforce(false, "invalid escape sequence"); 1506 } 1507 if (current >= privateUseStart && current <= privateUseEnd) 1508 { 1509 g.endPattern(current - privateUseStart + 1); 1510 break; 1511 } 1512 auto op = Bytecode(IR.Char, current); 1513 next(); 1514 g.put(op); 1515 } 1516 } 1517 1518 //parse and return a CodepointSet for \p{...Property...} and \P{...Property..}, 1519 //\ - assumed to be processed, p - is current 1520 CodepointSet parseUnicodePropertySpec(bool negated) 1521 { 1522 enum MAX_PROPERTY = 128; 1523 char[MAX_PROPERTY] result; 1524 uint k = 0; 1525 enforce(next(), "eof parsing unicode property spec"); 1526 if (current == '{') 1527 { 1528 while (k < MAX_PROPERTY && next() && current !='}' && current !=':') 1529 if (current != '-' && current != ' ' && current != '_') 1530 result[k++] = cast(char) std.ascii.toLower(current); 1531 enforce(k != MAX_PROPERTY, "invalid property name"); 1532 enforce(current == '}', "} expected "); 1533 } 1534 else 1535 {//single char properties e.g.: \pL, \pN ... 1536 enforce(current < 0x80, "invalid property name"); 1537 result[k++] = cast(char) current; 1538 } 1539 auto s = getUnicodeSet(result[0 .. k], negated, 1540 cast(bool)(re_flags & RegexOption.casefold)); 1541 enforce(!s.empty, "unrecognized unicode property spec"); 1542 next(); 1543 return s; 1544 } 1545 1546 // 1547 @trusted void error(string msg) 1548 { 1549 import std.array : appender; 1550 import std.format : formattedWrite; 1551 auto app = appender!string(); 1552 formattedWrite(app, "%s\nPattern with error: `%s` <--HERE-- `%s`", 1553 msg, origin[0..$-pat.length], pat); 1554 throw new RegexException(app.data); 1555 } 1556 1557 alias Char = BasicElementOf!R; 1558 1559 @property program() 1560 { 1561 return makeRegex(this); 1562 } 1563 } 1564 1565 /+ 1566 Postproces the IR, then optimize. 1567 +/ 1568 @trusted void postprocess(Char)(ref Regex!Char zis) 1569 {//@@@BUG@@@ write is @system 1570 with(zis) 1571 { 1572 struct FixedStack(T) 1573 { 1574 T[] arr; 1575 uint _top; 1576 //this(T[] storage){ arr = storage; _top = -1; } 1577 @property ref T top(){ assert(!empty); return arr[_top]; } 1578 void push(T x){ arr[++_top] = x; } 1579 T pop() { assert(!empty); return arr[_top--]; } 1580 @property bool empty(){ return _top == -1; } 1581 } 1582 auto counterRange = FixedStack!uint(new uint[maxCounterDepth+1], -1); 1583 counterRange.push(1); 1584 ulong cumRange = 0; 1585 for (uint i = 0; i < ir.length; i += ir[i].length) 1586 { 1587 if (ir[i].hotspot) 1588 { 1589 assert(i + 1 < ir.length, 1590 "unexpected end of IR while looking for hotspot"); 1591 ir[i+1] = Bytecode.fromRaw(hotspotTableSize); 1592 hotspotTableSize += counterRange.top; 1593 } 1594 switch (ir[i].code) 1595 { 1596 case IR.RepeatStart, IR.RepeatQStart: 1597 uint repEnd = cast(uint)(i + ir[i].data + IRL!(IR.RepeatStart)); 1598 assert(ir[repEnd].code == ir[i].paired.code); 1599 immutable max = ir[repEnd + 4].raw; 1600 ir[repEnd+2].raw = counterRange.top; 1601 ir[repEnd+3].raw *= counterRange.top; 1602 ir[repEnd+4].raw *= counterRange.top; 1603 ulong cntRange = cast(ulong)(max)*counterRange.top; 1604 cumRange += cntRange; 1605 enforce(cumRange < maxCumulativeRepetitionLength, 1606 "repetition length limit is exceeded"); 1607 counterRange.push(cast(uint) cntRange + counterRange.top); 1608 threadCount += counterRange.top; 1609 break; 1610 case IR.RepeatEnd, IR.RepeatQEnd: 1611 threadCount += counterRange.top; 1612 counterRange.pop(); 1613 break; 1614 case IR.GroupStart: 1615 if (isBackref(ir[i].data)) 1616 ir[i].setBackrefence(); 1617 threadCount += counterRange.top; 1618 break; 1619 case IR.GroupEnd: 1620 if (isBackref(ir[i].data)) 1621 ir[i].setBackrefence(); 1622 threadCount += counterRange.top; 1623 break; 1624 default: 1625 threadCount += counterRange.top; 1626 } 1627 } 1628 checkIfOneShot(); 1629 if (!(flags & RegexInfo.oneShot)) 1630 kickstart = Kickstart!Char(zis, new uint[](256)); 1631 debug(std_regex_allocation) writefln("IR processed, max threads: %d", threadCount); 1632 optimize(zis); 1633 } 1634 } 1635 1636 void fixupBytecode()(Bytecode[] ir) 1637 { 1638 Stack!uint fixups; 1639 1640 with(IR) for (uint i=0; i<ir.length; i+= ir[i].length) 1641 { 1642 if (ir[i].isStart || ir[i].code == Option) 1643 fixups.push(i); 1644 else if (ir[i].code == OrEnd) 1645 { 1646 // Alternatives need more care 1647 auto j = fixups.pop(); // last Option 1648 ir[j].data = i - j - ir[j].length; 1649 j = fixups.pop(); // OrStart 1650 ir[j].data = i - j - ir[j].length; 1651 ir[i].data = ir[j].data; 1652 1653 // fixup all GotoEndOrs 1654 j = j + IRL!(OrStart); 1655 assert(ir[j].code == Option); 1656 for (;;) 1657 { 1658 auto next = j + ir[j].data + IRL!(Option); 1659 if (ir[next].code == IR.OrEnd) 1660 break; 1661 ir[next - IRL!(GotoEndOr)].data = i - next; 1662 j = next; 1663 } 1664 } 1665 else if (ir[i].code == GotoEndOr) 1666 { 1667 auto j = fixups.pop(); // Option 1668 ir[j].data = i - j + IRL!(GotoEndOr)- IRL!(Option); // to the next option 1669 } 1670 else if (ir[i].isEnd) 1671 { 1672 auto j = fixups.pop(); 1673 ir[i].data = i - j - ir[j].length; 1674 ir[j].data = ir[i].data; 1675 } 1676 } 1677 assert(fixups.empty); 1678 } 1679 1680 void optimize(Char)(ref Regex!Char zis) 1681 { 1682 import std.array : insertInPlace; 1683 CodepointSet nextSet(uint idx) 1684 { 1685 CodepointSet set; 1686 with(zis) with(IR) 1687 Outer: 1688 for (uint i = idx; i < ir.length; i += ir[i].length) 1689 { 1690 switch (ir[i].code) 1691 { 1692 case Char: 1693 set.add(ir[i].data, ir[i].data+1); 1694 goto default; 1695 //TODO: OrChar 1696 case Trie, CodepointSet: 1697 set = zis.charsets[ir[i].data]; 1698 goto default; 1699 case GroupStart,GroupEnd: 1700 break; 1701 default: 1702 break Outer; 1703 } 1704 } 1705 return set; 1706 } 1707 1708 with(zis) with(IR) for (uint i = 0; i < ir.length; i += ir[i].length) 1709 { 1710 if (ir[i].code == InfiniteEnd) 1711 { 1712 auto set = nextSet(i+IRL!(InfiniteEnd)); 1713 if (!set.empty && set.length < 10_000) 1714 { 1715 ir[i] = Bytecode(InfiniteBloomEnd, ir[i].data); 1716 ir[i - ir[i].data - IRL!(InfiniteStart)] = 1717 Bytecode(InfiniteBloomStart, ir[i].data); 1718 ir.insertInPlace(i+IRL!(InfiniteEnd), 1719 Bytecode.fromRaw(cast(uint) zis.filters.length)); 1720 zis.filters ~= BitTable(set); 1721 fixupBytecode(ir); 1722 } 1723 } 1724 } 1725 } 1726 1727 //IR code validator - proper nesting, illegal instructions, etc. 1728 @trusted void validateRe(Char)(ref Regex!Char zis) 1729 {//@@@BUG@@@ text is @system 1730 import std.conv : text; 1731 with(zis) 1732 { 1733 for (uint pc = 0; pc < ir.length; pc += ir[pc].length) 1734 { 1735 if (ir[pc].isStart || ir[pc].isEnd) 1736 { 1737 immutable dest = ir[pc].indexOfPair(pc); 1738 assert(dest < ir.length, text("Wrong length in opcode at pc=", 1739 pc, " ", dest, " vs ", ir.length)); 1740 assert(ir[dest].paired == ir[pc], 1741 text("Wrong pairing of opcodes at pc=", pc, "and pc=", dest)); 1742 } 1743 else if (ir[pc].isAtom) 1744 { 1745 1746 } 1747 else 1748 assert(0, text("Unknown type of instruction at pc=", pc)); 1749 } 1750 } 1751 } 1752