1 /* 2 Kickstart is a coarse-grained "filter" engine that finds likely matches 3 to be verified by full-blown matcher. 4 */ 5 module std.regex.internal.kickstart; 6 7 package(std.regex): 8 9 import std.range.primitives, std.utf; 10 import std.regex.internal.ir; 11 12 //utility for shiftOr, returns a minimum number of bytes to test in a Char 13 uint effectiveSize(Char)() 14 { 15 static if (is(Char == char)) 16 return 1; 17 else static if (is(Char == wchar)) 18 return 2; 19 else static if (is(Char == dchar)) 20 return 3; 21 else 22 static assert(0); 23 } 24 25 /* 26 Kickstart engine using ShiftOr algorithm, 27 a bit parallel technique for inexact string searching. 28 */ 29 struct ShiftOr(Char) 30 { 31 private: 32 uint[] table; 33 uint fChar; 34 uint n_length; 35 enum charSize = effectiveSize!Char(); 36 //maximum number of chars in CodepointSet to process 37 enum uint charsetThreshold = 32_000; 38 static struct ShiftThread 39 { 40 uint[] tab; 41 uint mask; 42 uint idx; 43 uint pc, counter, hops; 44 this(uint newPc, uint newCounter, uint[] table) 45 { 46 pc = newPc; 47 counter = newCounter; 48 mask = 1; 49 idx = 0; 50 hops = 0; 51 tab = table; 52 } 53 54 void setMask(uint idx, uint mask) 55 { 56 tab[idx] |= mask; 57 } 58 59 void setInvMask(uint idx, uint mask) 60 { 61 tab[idx] &= ~mask; 62 } 63 64 void set(alias setBits = setInvMask)(dchar ch) 65 { 66 static if (charSize == 3) 67 { 68 uint val = ch, tmask = mask; 69 setBits(val&0xFF, tmask); 70 tmask <<= 1; 71 val >>= 8; 72 setBits(val&0xFF, tmask); 73 tmask <<= 1; 74 val >>= 8; 75 assert(val <= 0x10); 76 setBits(val, tmask); 77 tmask <<= 1; 78 } 79 else 80 { 81 Char[dchar.sizeof/Char.sizeof] buf; 82 uint tmask = mask; 83 size_t total = encode(buf, ch); 84 for (size_t i = 0; i < total; i++, tmask<<=1) 85 { 86 static if (charSize == 1) 87 setBits(buf[i], tmask); 88 else static if (charSize == 2) 89 { 90 setBits(buf[i]&0xFF, tmask); 91 tmask <<= 1; 92 setBits(buf[i]>>8, tmask); 93 } 94 } 95 } 96 } 97 void add(dchar ch){ return set!setInvMask(ch); } 98 void advance(uint s) 99 { 100 mask <<= s; 101 idx += s; 102 } 103 @property bool full(){ return !mask; } 104 } 105 106 static ShiftThread fork(ShiftThread t, uint newPc, uint newCounter) 107 { 108 ShiftThread nt = t; 109 nt.pc = newPc; 110 nt.counter = newCounter; 111 return nt; 112 } 113 114 @trusted static ShiftThread fetch(ref ShiftThread[] worklist) 115 { 116 auto t = worklist[$-1]; 117 worklist.length -= 1; 118 if (!__ctfe) 119 cast(void) worklist.assumeSafeAppend(); 120 return t; 121 } 122 123 static uint charLen(uint ch) 124 { 125 assert(ch <= 0x10FFFF); 126 return codeLength!Char(cast(dchar) ch)*charSize; 127 } 128 129 public: 130 @trusted this(ref Regex!Char re, uint[] memory) 131 { 132 static import std.algorithm.comparison; 133 import std.algorithm.searching : countUntil; 134 import std.conv : text; 135 import std.range : assumeSorted; 136 assert(memory.length == 256); 137 fChar = uint.max; 138 // FNV-1a flavored hash (uses 32bits at a time) 139 ulong hash(uint[] tab) 140 { 141 ulong h = 0xcbf29ce484222325; 142 foreach (v; tab) 143 { 144 h ^= v; 145 h *= 0x100000001b3; 146 } 147 return h; 148 } 149 L_FindChar: 150 for (size_t i = 0;;) 151 { 152 switch (re.ir[i].code) 153 { 154 case IR.Char: 155 fChar = re.ir[i].data; 156 static if (charSize != 3) 157 { 158 Char[dchar.sizeof/Char.sizeof] buf; 159 encode(buf, fChar); 160 fChar = buf[0]; 161 } 162 fChar = fChar & 0xFF; 163 break L_FindChar; 164 case IR.GroupStart, IR.GroupEnd: 165 i += IRL!(IR.GroupStart); 166 break; 167 case IR.Bof, IR.Bol, IR.Wordboundary, IR.Notwordboundary: 168 i += IRL!(IR.Bol); 169 break; 170 default: 171 break L_FindChar; 172 } 173 } 174 table = memory; 175 table[] = uint.max; 176 alias MergeTab = bool[ulong]; 177 // use reasonably complex hash to identify equivalent tables 178 auto merge = new MergeTab[re.hotspotTableSize]; 179 ShiftThread[] trs; 180 ShiftThread t = ShiftThread(0, 0, table); 181 //locate first fixed char if any 182 n_length = 32; 183 for (;;) 184 { 185 L_Eval_Thread: 186 for (;;) 187 { 188 switch (re.ir[t.pc].code) 189 { 190 case IR.Char: 191 uint s = charLen(re.ir[t.pc].data); 192 if (t.idx+s > n_length) 193 goto L_StopThread; 194 t.add(re.ir[t.pc].data); 195 t.advance(s); 196 t.pc += IRL!(IR.Char); 197 break; 198 case IR.OrChar://assumes IRL!(OrChar) == 1 199 uint len = re.ir[t.pc].sequence; 200 uint end = t.pc + len; 201 uint[Bytecode.maxSequence] s; 202 uint numS; 203 for (uint i = 0; i < len; i++) 204 { 205 auto x = charLen(re.ir[t.pc+i].data); 206 if (countUntil(s[0 .. numS], x) < 0) 207 s[numS++] = x; 208 } 209 for (uint i = t.pc; i < end; i++) 210 { 211 t.add(re.ir[i].data); 212 } 213 for (uint i = 0; i < numS; i++) 214 { 215 auto tx = fork(t, t.pc + len, t.counter); 216 if (tx.idx + s[i] <= n_length) 217 { 218 tx.advance(s[i]); 219 trs ~= tx; 220 } 221 } 222 if (!trs.empty) 223 t = fetch(trs); 224 else 225 goto L_StopThread; 226 break; 227 case IR.CodepointSet: 228 case IR.Trie: 229 auto set = re.charsets[re.ir[t.pc].data]; 230 uint[4] s; 231 uint numS; 232 static if (charSize == 3) 233 { 234 s[0] = charSize; 235 numS = 1; 236 } 237 else 238 { 239 240 static if (charSize == 1) 241 static immutable codeBounds = [0x0, 0x7F, 0x80, 0x7FF, 0x800, 0xFFFF, 0x10000, 0x10FFFF]; 242 else //== 2 243 static immutable codeBounds = [0x0, 0xFFFF, 0x10000, 0x10FFFF]; 244 uint[] arr = new uint[set.byInterval.length * 2]; 245 size_t ofs = 0; 246 foreach (ival; set.byInterval) 247 { 248 arr[ofs++] = ival.a; 249 arr[ofs++] = ival.b; 250 } 251 auto srange = assumeSorted!"a <= b"(arr); 252 for (uint i = 0; i < codeBounds.length/2; i++) 253 { 254 auto start = srange.lowerBound(codeBounds[2*i]).length; 255 auto end = srange.lowerBound(codeBounds[2*i+1]).length; 256 if (end > start || (end == start && (end & 1))) 257 s[numS++] = (i+1)*charSize; 258 } 259 } 260 if (numS == 0 || t.idx + s[numS-1] > n_length) 261 goto L_StopThread; 262 auto chars = set.length; 263 if (chars > charsetThreshold) 264 goto L_StopThread; 265 foreach (ch; set.byCodepoint) 266 { 267 //avoid surrogate pairs 268 if (0xD800 <= ch && ch <= 0xDFFF) 269 continue; 270 t.add(ch); 271 } 272 for (uint i = 0; i < numS; i++) 273 { 274 auto tx = fork(t, t.pc + IRL!(IR.CodepointSet), t.counter); 275 tx.advance(s[i]); 276 trs ~= tx; 277 } 278 if (!trs.empty) 279 t = fetch(trs); 280 else 281 goto L_StopThread; 282 break; 283 case IR.Any: 284 goto L_StopThread; 285 286 case IR.GotoEndOr: 287 t.pc += IRL!(IR.GotoEndOr)+re.ir[t.pc].data; 288 assert(re.ir[t.pc].code == IR.OrEnd); 289 goto case; 290 case IR.OrEnd: 291 auto slot = re.ir[t.pc+1].raw+t.counter; 292 auto val = hash(t.tab); 293 if (val in merge[slot]) 294 goto L_StopThread; // merge equivalent 295 merge[slot][val] = true; 296 t.pc += IRL!(IR.OrEnd); 297 break; 298 case IR.OrStart: 299 t.pc += IRL!(IR.OrStart); 300 goto case; 301 case IR.Option: 302 uint next = t.pc + re.ir[t.pc].data + IRL!(IR.Option); 303 //queue next Option 304 if (re.ir[next].code == IR.Option) 305 { 306 trs ~= fork(t, next, t.counter); 307 } 308 t.pc += IRL!(IR.Option); 309 break; 310 case IR.RepeatStart:case IR.RepeatQStart: 311 t.pc += IRL!(IR.RepeatStart)+re.ir[t.pc].data; 312 goto case IR.RepeatEnd; 313 case IR.RepeatEnd: 314 case IR.RepeatQEnd: 315 auto slot = re.ir[t.pc+1].raw+t.counter; 316 auto val = hash(t.tab); 317 if (val in merge[slot]) 318 goto L_StopThread; // merge equivalent 319 merge[slot][val] = true; 320 uint len = re.ir[t.pc].data; 321 uint step = re.ir[t.pc+2].raw; 322 uint min = re.ir[t.pc+3].raw; 323 if (t.counter < min) 324 { 325 t.counter += step; 326 t.pc -= len; 327 break; 328 } 329 uint max = re.ir[t.pc+4].raw; 330 if (t.counter < max) 331 { 332 trs ~= fork(t, t.pc - len, t.counter + step); 333 t.counter = t.counter%step; 334 t.pc += IRL!(IR.RepeatEnd); 335 } 336 else 337 { 338 t.counter = t.counter%step; 339 t.pc += IRL!(IR.RepeatEnd); 340 } 341 break; 342 case IR.InfiniteStart, IR.InfiniteQStart: 343 t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteStart); 344 goto case IR.InfiniteEnd; //both Q and non-Q 345 case IR.InfiniteEnd: 346 case IR.InfiniteQEnd: 347 auto slot = re.ir[t.pc+1].raw+t.counter; 348 auto val = hash(t.tab); 349 if (val in merge[slot]) 350 goto L_StopThread; // merge equivalent 351 merge[slot][val] = true; 352 uint len = re.ir[t.pc].data; 353 uint pc1, pc2; //branches to take in priority order 354 if (++t.hops == 32) 355 goto L_StopThread; 356 pc1 = t.pc + IRL!(IR.InfiniteEnd); 357 pc2 = t.pc - len; 358 trs ~= fork(t, pc2, t.counter); 359 t.pc = pc1; 360 break; 361 case IR.GroupStart, IR.GroupEnd: 362 t.pc += IRL!(IR.GroupStart); 363 break; 364 case IR.Bof, IR.Bol, IR.Wordboundary, IR.Notwordboundary: 365 t.pc += IRL!(IR.Bol); 366 break; 367 case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart: 368 t.pc += IRL!(IR.LookaheadStart) + IRL!(IR.LookaheadEnd) + re.ir[t.pc].data; 369 break; 370 default: 371 L_StopThread: 372 assert(re.ir[t.pc].code >= 0x80, text(re.ir[t.pc].code)); 373 debug (fred_search) writeln("ShiftOr stumbled on ",re.ir[t.pc].mnemonic); 374 n_length = std.algorithm.comparison.min(t.idx, n_length); 375 break L_Eval_Thread; 376 } 377 } 378 if (trs.empty) 379 break; 380 t = fetch(trs); 381 } 382 debug(std_regex_search) 383 { 384 writeln("Min length: ", n_length); 385 } 386 } 387 388 @property bool empty() const { return n_length == 0; } 389 390 @property uint length() const{ return n_length/charSize; } 391 392 // lookup compatible bit pattern in haystack, return starting index 393 // has a useful trait: if supplied with valid UTF indexes, 394 // returns only valid UTF indexes 395 // (that given the haystack in question is valid UTF string) 396 @trusted size_t search(const(Char)[] haystack, size_t idx) 397 {//@BUG: apparently assumes little endian machines 398 import core.stdc.string : memchr; 399 import std.conv : text; 400 assert(!empty); 401 auto p = cast(const(ubyte)*)(haystack.ptr+idx); 402 uint state = uint.max; 403 uint limit = 1u<<(n_length - 1u); 404 debug(std_regex_search) writefln("Limit: %32b",limit); 405 if (fChar != uint.max) 406 { 407 const(ubyte)* end = cast(ubyte*)(haystack.ptr + haystack.length); 408 const orginalAlign = cast(size_t) p & (Char.sizeof-1); 409 while (p != end) 410 { 411 if (!~state) 412 {//speed up seeking first matching place 413 for (;;) 414 { 415 assert(p <= end, text(p," vs ", end)); 416 p = cast(ubyte*) memchr(p, fChar, end - p); 417 if (!p) 418 return haystack.length; 419 if ((cast(size_t) p & (Char.sizeof-1)) == orginalAlign) 420 break; 421 if (++p == end) 422 return haystack.length; 423 } 424 state = ~1u; 425 assert((cast(size_t) p & (Char.sizeof-1)) == orginalAlign); 426 static if (charSize == 3) 427 { 428 state = (state << 1) | table[p[1]]; 429 state = (state << 1) | table[p[2]]; 430 p += 4; 431 } 432 else 433 p++; 434 //first char is tested, see if that's all 435 if (!(state & limit)) 436 return (p-cast(ubyte*) haystack.ptr)/Char.sizeof 437 -length; 438 } 439 else 440 {//have some bits/states for possible matches, 441 //use the usual shift-or cycle 442 static if (charSize == 3) 443 { 444 state = (state << 1) | table[p[0]]; 445 state = (state << 1) | table[p[1]]; 446 state = (state << 1) | table[p[2]]; 447 p += 4; 448 } 449 else 450 { 451 state = (state << 1) | table[p[0]]; 452 p++; 453 } 454 if (!(state & limit)) 455 return (p-cast(ubyte*) haystack.ptr)/Char.sizeof 456 -length; 457 } 458 debug(std_regex_search) writefln("State: %32b", state); 459 } 460 } 461 else 462 { 463 //normal path, partially unrolled for char/wchar 464 static if (charSize == 3) 465 { 466 const(ubyte)* end = cast(ubyte*)(haystack.ptr + haystack.length); 467 while (p != end) 468 { 469 state = (state << 1) | table[p[0]]; 470 state = (state << 1) | table[p[1]]; 471 state = (state << 1) | table[p[2]]; 472 p += 4; 473 if (!(state & limit))//division rounds down for dchar 474 return (p-cast(ubyte*) haystack.ptr)/Char.sizeof 475 -length; 476 } 477 } 478 else 479 { 480 auto len = cast(ubyte*)(haystack.ptr + haystack.length) - p; 481 size_t i = 0; 482 if (len & 1) 483 { 484 state = (state << 1) | table[p[i++]]; 485 if (!(state & limit)) 486 return idx+i/Char.sizeof-length; 487 } 488 while (i < len) 489 { 490 state = (state << 1) | table[p[i++]]; 491 if (!(state & limit)) 492 return idx+i/Char.sizeof 493 -length; 494 state = (state << 1) | table[p[i++]]; 495 if (!(state & limit)) 496 return idx+i/Char.sizeof 497 -length; 498 debug(std_regex_search) writefln("State: %32b", state); 499 } 500 } 501 } 502 return haystack.length; 503 } 504 505 @system debug static void dump(uint[] table) 506 {//@@@BUG@@@ writef(ln) is @system 507 import std.stdio : writefln; 508 for (size_t i = 0; i < table.length; i += 4) 509 { 510 writefln("%32b %32b %32b %32b",table[i], table[i+1], table[i+2], table[i+3]); 511 } 512 } 513 } 514 515 @system unittest 516 { 517 import std.conv, std.regex; 518 @trusted void test_fixed(alias Kick)() 519 { 520 foreach (i, v; AliasSeq!(char, wchar, dchar)) 521 { 522 alias Char = v; 523 alias String = immutable(v)[]; 524 auto r = regex(to!String(`abc$`)); 525 auto kick = Kick!Char(r, new uint[256]); 526 assert(kick.length == 3, text(Kick.stringof," ",v.stringof, " == ", kick.length)); 527 auto r2 = regex(to!String(`(abc){2}a+`)); 528 kick = Kick!Char(r2, new uint[256]); 529 assert(kick.length == 7, text(Kick.stringof,v.stringof," == ", kick.length)); 530 auto r3 = regex(to!String(`\b(a{2}b{3}){2,4}`)); 531 kick = Kick!Char(r3, new uint[256]); 532 assert(kick.length == 10, text(Kick.stringof,v.stringof," == ", kick.length)); 533 auto r4 = regex(to!String(`\ba{2}c\bxyz`)); 534 kick = Kick!Char(r4, new uint[256]); 535 assert(kick.length == 6, text(Kick.stringof,v.stringof, " == ", kick.length)); 536 auto r5 = regex(to!String(`\ba{2}c\b`)); 537 kick = Kick!Char(r5, new uint[256]); 538 size_t x = kick.search("aabaacaa", 0); 539 assert(x == 3, text(Kick.stringof,v.stringof," == ", kick.length)); 540 x = kick.search("aabaacaa", x+1); 541 assert(x == 8, text(Kick.stringof,v.stringof," == ", kick.length)); 542 } 543 } 544 @trusted void test_flex(alias Kick)() 545 { 546 foreach (i, v; AliasSeq!(char, wchar, dchar)) 547 { 548 alias Char = v; 549 alias String = immutable(v)[]; 550 auto r = regex(to!String(`abc[a-z]`)); 551 auto kick = Kick!Char(r, new uint[256]); 552 auto x = kick.search(to!String("abbabca"), 0); 553 assert(x == 3, text("real x is ", x, " ",v.stringof)); 554 555 auto r2 = regex(to!String(`(ax|bd|cdy)`)); 556 String s2 = to!String("abdcdyabax"); 557 kick = Kick!Char(r2, new uint[256]); 558 x = kick.search(s2, 0); 559 assert(x == 1, text("real x is ", x)); 560 x = kick.search(s2, x+1); 561 assert(x == 3, text("real x is ", x)); 562 x = kick.search(s2, x+1); 563 assert(x == 8, text("real x is ", x)); 564 auto rdot = regex(to!String(`...`)); 565 kick = Kick!Char(rdot, new uint[256]); 566 assert(kick.length == 0); 567 auto rN = regex(to!String(`a(b+|c+)x`)); 568 kick = Kick!Char(rN, new uint[256]); 569 assert(kick.length == 3, to!string(kick.length)); 570 assert(kick.search("ababx",0) == 2); 571 assert(kick.search("abaacba",0) == 3);//expected inexact 572 573 } 574 } 575 test_fixed!(ShiftOr)(); 576 test_flex!(ShiftOr)(); 577 } 578 579 alias Kickstart = ShiftOr; 580