/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */

#include "zstd_compress_internal.h"
#include "zstd_lazy.h"
#include "../common/bits.h" /* ZSTD_countTrailingZeros64 */

#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \
 || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \
 || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \
 || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR)

#define kLazySkippingStep 8


/*-*************************************
*  Binary Tree search
***************************************/

static
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
void ZSTD_updateDUBT(ZSTD_matchState_t* ms,
                const BYTE* ip, const BYTE* iend,
                U32 mls)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32* const hashTable = ms->hashTable;
    U32  const hashLog = cParams->hashLog;

    U32* const bt = ms->chainTable;
    U32  const btLog  = cParams->chainLog - 1;
    U32  const btMask = (1 << btLog) - 1;

    const BYTE* const base = ms->window.base;
    U32 const target = (U32)(ip - base);
    U32 idx = ms->nextToUpdate;

    if (idx != target)
        DEBUGLOG(7, "ZSTD_updateDUBT, from %u to %u (dictLimit:%u)",
                    idx, target, ms->window.dictLimit);
    assert(ip + 8 <= iend);   /* condition for ZSTD_hashPtr */
    (void)iend;

    assert(idx >= ms->window.dictLimit);   /* condition for valid base+idx */
    for ( ; idx < target ; idx++) {
        size_t const h  = ZSTD_hashPtr(base + idx, hashLog, mls);   /* assumption : ip + 8 <= iend */
        U32    const matchIndex = hashTable[h];

        U32*   const nextCandidatePtr = bt + 2*(idx&btMask);
        U32*   const sortMarkPtr  = nextCandidatePtr + 1;

        DEBUGLOG(8, "ZSTD_updateDUBT: insert %u", idx);
        hashTable[h] = idx;   /* Update Hash Table */
        *nextCandidatePtr = matchIndex;   /* update BT like a chain */
        *sortMarkPtr = ZSTD_DUBT_UNSORTED_MARK;
    }
    ms->nextToUpdate = target;
}
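/* Illustrative note (editor's sketch, not part of the original file): each position
 * idx owns a two-slot node in the chainTable-as-binary-tree, at bt + 2*(idx & btMask).
 * While unsorted, slot 0 holds the next candidate of the hash chain and slot 1 holds
 * ZSTD_DUBT_UNSORTED_MARK; once sorted (see ZSTD_insertDUBT1() below), slot 0 becomes
 * the root of the "smaller" subtree and slot 1 the root of the "larger" one:
 *
 *     U32* node    = bt + 2*(idx & btMask);
 *     U32* smaller = node;        // unsorted: next candidate in the hash chain
 *     U32* larger  = node + 1;    // unsorted: ZSTD_DUBT_UNSORTED_MARK
 */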

/** ZSTD_insertDUBT1() :
 *  sort one already inserted but unsorted position
 *  assumption : curr >= btlow == (curr - btmask)
 *  doesn't fail */
static
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
void ZSTD_insertDUBT1(const ZSTD_matchState_t* ms,
                 U32 curr, const BYTE* inputEnd,
                 U32 nbCompares, U32 btLow,
                 const ZSTD_dictMode_e dictMode)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32* const bt = ms->chainTable;
    U32  const btLog  = cParams->chainLog - 1;
    U32  const btMask = (1 << btLog) - 1;
    size_t commonLengthSmaller=0, commonLengthLarger=0;
    const BYTE* const base = ms->window.base;
    const BYTE* const dictBase = ms->window.dictBase;
    const U32 dictLimit = ms->window.dictLimit;
    const BYTE* const ip = (curr>=dictLimit) ? base + curr : dictBase + curr;
    const BYTE* const iend = (curr>=dictLimit) ? inputEnd : dictBase + dictLimit;
    const BYTE* const dictEnd = dictBase + dictLimit;
    const BYTE* const prefixStart = base + dictLimit;
    const BYTE* match;
    U32* smallerPtr = bt + 2*(curr&btMask);
    U32* largerPtr  = smallerPtr + 1;
    U32 matchIndex = *smallerPtr;   /* this candidate is unsorted : next sorted candidate is reached through *smallerPtr, while *largerPtr contains previous unsorted candidate (which is already saved and can be overwritten) */
    U32 dummy32;   /* to be nullified at the end */
    U32 const windowValid = ms->window.lowLimit;
    U32 const maxDistance = 1U << cParams->windowLog;
    U32 const windowLow = (curr - windowValid > maxDistance) ? curr - maxDistance : windowValid;


    DEBUGLOG(8, "ZSTD_insertDUBT1(%u) (dictLimit=%u, lowLimit=%u)",
                curr, dictLimit, windowLow);
    assert(curr >= btLow);
    assert(ip < iend);   /* condition for ZSTD_count */

    for (; nbCompares && (matchIndex > windowLow); --nbCompares) {
        U32* const nextPtr = bt + 2*(matchIndex & btMask);
        size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
        assert(matchIndex < curr);
        /* note : all candidates are now supposed sorted,
         * but it's still possible to have nextPtr[1] == ZSTD_DUBT_UNSORTED_MARK
         * when a real index has the same value as ZSTD_DUBT_UNSORTED_MARK */

        if ( (dictMode != ZSTD_extDict)
          || (matchIndex+matchLength >= dictLimit)  /* both in current segment*/
          || (curr < dictLimit) /* both in extDict */) {
            const BYTE* const mBase = ( (dictMode != ZSTD_extDict)
                                     || (matchIndex+matchLength >= dictLimit)) ?
                                        base : dictBase;
            assert( (matchIndex+matchLength >= dictLimit)   /* might be wrong if extDict is incorrectly set to 0 */
                 || (curr < dictLimit) );
            match = mBase + matchIndex;
            matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
        } else {
            match = dictBase + matchIndex;
            matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
            if (matchIndex+matchLength >= dictLimit)
                match = base + matchIndex;   /* preparation for next read of match[matchLength] */
        }

        DEBUGLOG(8, "ZSTD_insertDUBT1: comparing %u with %u : found %u common bytes ",
                    curr, matchIndex, (U32)matchLength);

        if (ip+matchLength == iend) {   /* equal : no way to know if inf or sup */
            break;   /* drop, to guarantee consistency; miss a bit of compression, but other solutions can corrupt tree */
        }

        if (match[matchLength] < ip[matchLength]) {  /* necessarily within buffer */
            /* match is smaller than current */
            *smallerPtr = matchIndex;             /* update smaller idx */
            commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
            if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop searching */
            DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is smaller : next => %u",
                        matchIndex, btLow, nextPtr[1]);
            smallerPtr = nextPtr+1;               /* new "candidate" => larger than match, which was smaller than target */
            matchIndex = nextPtr[1];              /* new matchIndex, larger than previous and closer to current */
        } else {
            /* match is larger than current */
            *largerPtr = matchIndex;
            commonLengthLarger = matchLength;
            if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop searching */
            DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is larger => %u",
                        matchIndex, btLow, nextPtr[0]);
            largerPtr = nextPtr;
            matchIndex = nextPtr[0];
    }   }

    *smallerPtr = *largerPtr = 0;
}


static
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_DUBT_findBetterDictMatch (
        const ZSTD_matchState_t* ms,
        const BYTE* const ip, const BYTE* const iend,
        size_t* offsetPtr,
        size_t bestLength,
        U32 nbCompares,
        U32 const mls,
        const ZSTD_dictMode_e dictMode)
{
    const ZSTD_matchState_t * const dms = ms->dictMatchState;
    const ZSTD_compressionParameters* const dmsCParams = &dms->cParams;
    const U32 * const dictHashTable = dms->hashTable;
    U32         const hashLog = dmsCParams->hashLog;
    size_t      const h  = ZSTD_hashPtr(ip, hashLog, mls);
    U32               dictMatchIndex = dictHashTable[h];

    const BYTE* const base = ms->window.base;
    const BYTE* const prefixStart = base + ms->window.dictLimit;
    U32         const curr = (U32)(ip-base);
    const BYTE* const dictBase = dms->window.base;
    const BYTE* const dictEnd = dms->window.nextSrc;
    U32         const dictHighLimit = (U32)(dms->window.nextSrc - dms->window.base);
    U32         const dictLowLimit = dms->window.lowLimit;
    U32         const dictIndexDelta = ms->window.lowLimit - dictHighLimit;

    U32*        const dictBt = dms->chainTable;
    U32         const btLog  = dmsCParams->chainLog - 1;
    U32         const btMask = (1 << btLog) - 1;
    U32         const btLow = (btMask >= dictHighLimit - dictLowLimit) ? dictLowLimit : dictHighLimit - btMask;

    size_t commonLengthSmaller=0, commonLengthLarger=0;

    (void)dictMode;
    assert(dictMode == ZSTD_dictMatchState);

    for (; nbCompares && (dictMatchIndex > dictLowLimit); --nbCompares) {
        U32* const nextPtr = dictBt + 2*(dictMatchIndex & btMask);
        size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
        const BYTE* match = dictBase + dictMatchIndex;
        matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
        if (dictMatchIndex+matchLength >= dictHighLimit)
            match = base + dictMatchIndex + dictIndexDelta;   /* to prepare for next usage of match[matchLength] */

        if (matchLength > bestLength) {
            U32 matchIndex = dictMatchIndex + dictIndexDelta;
            if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
                DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
                    curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex);
                bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
            }
            if (ip+matchLength == iend) {   /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
                break;   /* drop, to guarantee consistency (miss a little bit of compression) */
            }
        }

        if (match[matchLength] < ip[matchLength]) {
            if (dictMatchIndex <= btLow) { break; }   /* beyond tree size, stop the search */
            commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
            dictMatchIndex = nextPtr[1];              /* new matchIndex larger than previous (closer to current) */
        } else {
            /* match is larger than current */
            if (dictMatchIndex <= btLow) { break; }   /* beyond tree size, stop the search */
            commonLengthLarger = matchLength;
            dictMatchIndex = nextPtr[0];
        }
    }

    if (bestLength >= MINMATCH) {
        U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex;
        DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
                    curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
    }
    return bestLength;

}


static
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
                        const BYTE* const ip, const BYTE* const iend,
                        size_t* offBasePtr,
                        U32 const mls,
                        const ZSTD_dictMode_e dictMode)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32*   const hashTable = ms->hashTable;
    U32    const hashLog = cParams->hashLog;
    size_t const h  = ZSTD_hashPtr(ip, hashLog, mls);
    U32          matchIndex  = hashTable[h];

    const BYTE* const base = ms->window.base;
    U32    const curr = (U32)(ip-base);
    U32    const windowLow = ZSTD_getLowestMatchIndex(ms, curr, cParams->windowLog);

    U32*   const bt = ms->chainTable;
    U32    const btLog  = cParams->chainLog - 1;
    U32    const btMask = (1 << btLog) - 1;
    U32    const btLow = (btMask >= curr) ? 0 : curr - btMask;
    U32    const unsortLimit = MAX(btLow, windowLow);

    U32*         nextCandidate = bt + 2*(matchIndex&btMask);
    U32*         unsortedMark = bt + 2*(matchIndex&btMask) + 1;
    U32          nbCompares = 1U << cParams->searchLog;
    U32          nbCandidates = nbCompares;
    U32          previousCandidate = 0;

    DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", curr);
    assert(ip <= iend-8);   /* required for h calculation */
    assert(dictMode != ZSTD_dedicatedDictSearch);

    /* reach end of unsorted candidates list */
    while ( (matchIndex > unsortLimit)
         && (*unsortedMark == ZSTD_DUBT_UNSORTED_MARK)
         && (nbCandidates > 1) ) {
        DEBUGLOG(8, "ZSTD_DUBT_findBestMatch: candidate %u is unsorted",
                    matchIndex);
        *unsortedMark = previousCandidate;  /* the unsortedMark becomes a reversed chain, to move up back to original position */
        previousCandidate = matchIndex;
        matchIndex = *nextCandidate;
        nextCandidate = bt + 2*(matchIndex&btMask);
        unsortedMark = bt + 2*(matchIndex&btMask) + 1;
        nbCandidates --;
    }

    /* nullify last candidate if it's still unsorted
     * simplification, detrimental to compression ratio, beneficial for speed */
    if ( (matchIndex > unsortLimit)
      && (*unsortedMark==ZSTD_DUBT_UNSORTED_MARK) ) {
        DEBUGLOG(7, "ZSTD_DUBT_findBestMatch: nullify last unsorted candidate %u",
                    matchIndex);
        *nextCandidate = *unsortedMark = 0;
    }

    /* batch sort stacked candidates */
    matchIndex = previousCandidate;
    while (matchIndex) {  /* will end on matchIndex == 0 */
        U32* const nextCandidateIdxPtr = bt + 2*(matchIndex&btMask) + 1;
        U32 const nextCandidateIdx = *nextCandidateIdxPtr;
        ZSTD_insertDUBT1(ms, matchIndex, iend,
                         nbCandidates, unsortLimit, dictMode);
        matchIndex = nextCandidateIdx;
        nbCandidates++;
    }

    /* find longest match */
    {   size_t commonLengthSmaller = 0, commonLengthLarger = 0;
        const BYTE* const dictBase = ms->window.dictBase;
        const U32 dictLimit = ms->window.dictLimit;
        const BYTE* const dictEnd = dictBase + dictLimit;
        const BYTE* const prefixStart = base + dictLimit;
        U32* smallerPtr = bt + 2*(curr&btMask);
        U32* largerPtr  = bt + 2*(curr&btMask) + 1;
        U32 matchEndIdx = curr + 8 + 1;
        U32 dummy32;   /* to be nullified at the end */
        size_t bestLength = 0;

        matchIndex  = hashTable[h];
        hashTable[h] = curr;   /* Update Hash Table */

        for (; nbCompares && (matchIndex > windowLow); --nbCompares) {
            U32* const nextPtr = bt + 2*(matchIndex & btMask);
            size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
            const BYTE* match;

            if ((dictMode != ZSTD_extDict) || (matchIndex+matchLength >= dictLimit)) {
                match = base + matchIndex;
                matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
            } else {
                match = dictBase + matchIndex;
                matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
                if (matchIndex+matchLength >= dictLimit)
                    match = base + matchIndex;   /* to prepare for next usage of match[matchLength] */
            }

            if (matchLength > bestLength) {
                if (matchLength > matchEndIdx - matchIndex)
                    matchEndIdx = matchIndex + (U32)matchLength;
                if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) )
                    bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex);
                if (ip+matchLength == iend) {   /* equal : no way to know if inf or sup */
                    if (dictMode == ZSTD_dictMatchState) {
                        nbCompares = 0; /* in addition to avoiding checking any
                                         * further in this loop, make sure we
                                         * skip checking in the dictionary. */
                    }
                    break;   /* drop, to guarantee consistency (miss a little bit of compression) */
                }
            }

            if (match[matchLength] < ip[matchLength]) {
                /* match is smaller than current */
                *smallerPtr = matchIndex;             /* update smaller idx */
                commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
                if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
                smallerPtr = nextPtr+1;               /* new "smaller" => larger of match */
                matchIndex = nextPtr[1];              /* new matchIndex larger than previous (closer to current) */
            } else {
                /* match is larger than current */
                *largerPtr = matchIndex;
                commonLengthLarger = matchLength;
                if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
                largerPtr = nextPtr;
                matchIndex = nextPtr[0];
        }   }

        *smallerPtr = *largerPtr = 0;

        assert(nbCompares <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
        if (dictMode == ZSTD_dictMatchState && nbCompares) {
            bestLength = ZSTD_DUBT_findBetterDictMatch(
                    ms, ip, iend,
                    offBasePtr, bestLength, nbCompares,
                    mls, dictMode);
        }

        assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */
        ms->nextToUpdate = matchEndIdx - 8;   /* skip repetitive patterns */
        if (bestLength >= MINMATCH) {
            U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex;
            DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
                        curr, (U32)bestLength, (U32)*offBasePtr, mIndex);
        }
        return bestLength;
    }
}


/** ZSTD_BtFindBestMatch() : Tree updater, providing best match */
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
                const BYTE* const ip, const BYTE* const iLimit,
                      size_t* offBasePtr,
                const U32 mls /* template */,
                const ZSTD_dictMode_e dictMode)
{
    DEBUGLOG(7, "ZSTD_BtFindBestMatch");
    if (ip < ms->window.base + ms->nextToUpdate) return 0;   /* skipped area */
    ZSTD_updateDUBT(ms, ip, iLimit, mls);
    return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode);
}

/***********************************
* Dedicated dict search
***********************************/

void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip)
{
    const BYTE* const base = ms->window.base;
    U32 const target = (U32)(ip - base);
    U32* const hashTable = ms->hashTable;
    U32* const chainTable = ms->chainTable;
    U32 const chainSize = 1 << ms->cParams.chainLog;
    U32 idx = ms->nextToUpdate;
    U32 const minChain = chainSize < target - idx ? target - chainSize : idx;
    U32 const bucketSize = 1 << ZSTD_LAZY_DDSS_BUCKET_LOG;
    U32 const cacheSize = bucketSize - 1;
    U32 const chainAttempts = (1 << ms->cParams.searchLog) - cacheSize;
    U32 const chainLimit = chainAttempts > 255 ? 255 : chainAttempts;

    /* We know the hashtable is oversized by a factor of `bucketSize`.
     * We are going to temporarily pretend `bucketSize == 1`, keeping only a
     * single entry. We will use the rest of the space to construct a temporary
     * chaintable.
     */
    U32 const hashLog = ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
    U32* const tmpHashTable = hashTable;
    U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog);
    U32 const tmpChainSize = (U32)((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog;
    U32 const tmpMinChain = tmpChainSize < target ? target - tmpChainSize : idx;
    U32 hashIdx;

    assert(ms->cParams.chainLog <= 24);
    assert(ms->cParams.hashLog > ms->cParams.chainLog);
    assert(idx != 0);
    assert(tmpMinChain <= minChain);

    /* fill conventional hash table and conventional chain table */
    for ( ; idx < target; idx++) {
        U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch);
        if (idx >= tmpMinChain) {
            tmpChainTable[idx - tmpMinChain] = hashTable[h];
        }
        tmpHashTable[h] = idx;
    }

    /* sort chains into ddss chain table */
    {
        U32 chainPos = 0;
        for (hashIdx = 0; hashIdx < (1U << hashLog); hashIdx++) {
            U32 count;
            U32 countBeyondMinChain = 0;
            U32 i = tmpHashTable[hashIdx];
            for (count = 0; i >= tmpMinChain && count < cacheSize; count++) {
                /* skip through the chain to the first position that won't be
                 * in the hash cache bucket */
                if (i < minChain) {
                    countBeyondMinChain++;
                }
                i = tmpChainTable[i - tmpMinChain];
            }
            if (count == cacheSize) {
                for (count = 0; count < chainLimit;) {
                    if (i < minChain) {
                        if (!i || ++countBeyondMinChain > cacheSize) {
                            /* only allow pulling `cacheSize` number of entries
                             * into the cache or chainTable beyond `minChain`,
                             * to replace the entries pulled out of the
                             * chainTable into the cache. This lets us reach
                             * back further without increasing the total number
                             * of entries in the chainTable, guaranteeing the
                             * DDSS chain table will fit into the space
                             * allocated for the regular one. */
                            break;
                        }
                    }
                    chainTable[chainPos++] = i;
                    count++;
                    if (i < tmpMinChain) {
                        break;
                    }
                    i = tmpChainTable[i - tmpMinChain];
                }
            } else {
                count = 0;
            }
            if (count) {
                tmpHashTable[hashIdx] = ((chainPos - count) << 8) + count;
            } else {
                tmpHashTable[hashIdx] = 0;
            }
        }
        assert(chainPos <= chainSize); /* I believe this is guaranteed... */
    }

    /* move chain pointers into the last entry of each hash bucket */
    for (hashIdx = (1 << hashLog); hashIdx; ) {
        U32 const bucketIdx = --hashIdx << ZSTD_LAZY_DDSS_BUCKET_LOG;
        U32 const chainPackedPointer = tmpHashTable[hashIdx];
        U32 i;
        for (i = 0; i < cacheSize; i++) {
            hashTable[bucketIdx + i] = 0;
        }
        hashTable[bucketIdx + bucketSize - 1] = chainPackedPointer;
    }

    /* fill the buckets of the hash table */
    for (idx = ms->nextToUpdate; idx < target; idx++) {
        U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch)
                   << ZSTD_LAZY_DDSS_BUCKET_LOG;
        U32 i;
        /* Shift hash cache down 1. */
        for (i = cacheSize - 1; i; i--)
            hashTable[h + i] = hashTable[h + i - 1];
        hashTable[h] = idx;
    }

    ms->nextToUpdate = target;
}
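/* Illustrative note (editor's sketch, not part of the original file): the "packed
 * chain pointer" written above stores the chain's starting slot in the upper 24 bits
 * and its length in the low 8 bits, i.e. ((chainPos - count) << 8) + count.
 * The search side (ZSTD_dedicatedDictSearch_lazy_search() below) unpacks it the same way:
 *
 *     U32 const chainPackedPointer = hashTable[bucketIdx + bucketSize - 1];
 *     U32 const chainIndex  = chainPackedPointer >> 8;    // first entry in chainTable
 *     U32 const chainLength = chainPackedPointer & 0xFF;  // nb of entries, capped at 255 (chainLimit)
 */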

/* Returns the longest match length found in the dedicated dict search structure.
 * If none are longer than the argument ml, then ml will be returned.
 */
FORCE_INLINE_TEMPLATE
size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nbAttempts,
                                            const ZSTD_matchState_t* const dms,
                                            const BYTE* const ip, const BYTE* const iLimit,
                                            const BYTE* const prefixStart, const U32 curr,
                                            const U32 dictLimit, const size_t ddsIdx) {
    const U32 ddsLowestIndex  = dms->window.dictLimit;
    const BYTE* const ddsBase = dms->window.base;
    const BYTE* const ddsEnd  = dms->window.nextSrc;
    const U32 ddsSize         = (U32)(ddsEnd - ddsBase);
    const U32 ddsIndexDelta   = dictLimit - ddsSize;
    const U32 bucketSize      = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG);
    const U32 bucketLimit     = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1;
    U32 ddsAttempt;
    U32 matchIndex;

    for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) {
        PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]);
    }

    {
        U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
        U32 const chainIndex = chainPackedPointer >> 8;

        PREFETCH_L1(&dms->chainTable[chainIndex]);
    }

    for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) {
        size_t currentMl=0;
        const BYTE* match;
        matchIndex = dms->hashTable[ddsIdx + ddsAttempt];
        match = ddsBase + matchIndex;

        if (!matchIndex) {
            return ml;
        }

        /* guaranteed by table construction */
        (void)ddsLowestIndex;
        assert(matchIndex >= ddsLowestIndex);
        assert(match+4 <= ddsEnd);
        if (MEM_read32(match) == MEM_read32(ip)) {
            /* assumption : matchIndex <= dictLimit-4 (by table construction) */
            currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
        }

        /* save best solution */
        if (currentMl > ml) {
            ml = currentMl;
            *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
            if (ip+currentMl == iLimit) {
                /* best possible, avoids read overflow on next attempt */
                return ml;
            }
        }
    }

    {
        U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
        U32 chainIndex = chainPackedPointer >> 8;
        U32 const chainLength = chainPackedPointer & 0xFF;
        U32 const chainAttempts = nbAttempts - ddsAttempt;
        U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts;
        U32 chainAttempt;

        for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) {
            PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]);
        }

        for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) {
            size_t currentMl=0;
            const BYTE* match;
            matchIndex = dms->chainTable[chainIndex];
            match = ddsBase + matchIndex;

            /* guaranteed by table construction */
            assert(matchIndex >= ddsLowestIndex);
            assert(match+4 <= ddsEnd);
            if (MEM_read32(match) == MEM_read32(ip)) {
                /* assumption : matchIndex <= dictLimit-4 (by table construction) */
                currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
            }

            /* save best solution */
            if (currentMl > ml) {
                ml = currentMl;
                *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
                if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
            }
        }
    }
    return ml;
}


/* *********************************
*  Hash Chain
***********************************/
#define NEXT_IN_CHAIN(d, mask)   chainTable[(d) & (mask)]

/* Update chains up to ip (excluded)
   Assumption : always within prefix (i.e. not within extDict) */
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
U32 ZSTD_insertAndFindFirstIndex_internal(
                        ZSTD_matchState_t* ms,
                        const ZSTD_compressionParameters* const cParams,
                        const BYTE* ip, U32 const mls, U32 const lazySkipping)
{
    U32* const hashTable  = ms->hashTable;
    const U32 hashLog = cParams->hashLog;
    U32* const chainTable = ms->chainTable;
    const U32 chainMask = (1 << cParams->chainLog) - 1;
    const BYTE* const base = ms->window.base;
    const U32 target = (U32)(ip - base);
    U32 idx = ms->nextToUpdate;

    while(idx < target) { /* catch up */
        size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls);
        NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
        hashTable[h] = idx;
        idx++;
        /* Stop inserting every position when in the lazy skipping mode. */
        if (lazySkipping)
            break;
    }

    ms->nextToUpdate = target;
    return hashTable[ZSTD_hashPtr(ip, hashLog, mls)];
}
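/* Illustrative note (editor's sketch, not part of the original file): hashTable[h]
 * always holds the most recent position whose hash is h, and chainTable[pos & chainMask]
 * links each inserted position to the previous position that shared its hash. A search
 * therefore starts from the head returned above and walks backwards, as the hash chain
 * match finder below does:
 *
 *     U32 matchIndex = hashTable[ZSTD_hashPtr(ip, hashLog, mls)];
 *     while (matchIndex >= lowLimit && nbAttempts--) {
 *         // compare ip against base + matchIndex ...
 *         matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask);
 *     }
 */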

U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0);
}

/* inlining is important to hardwire a hot branch (template emulation) */
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_HcFindBestMatch(
                        ZSTD_matchState_t* ms,
                        const BYTE* const ip, const BYTE* const iLimit,
                        size_t* offsetPtr,
                        const U32 mls, const ZSTD_dictMode_e dictMode)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32* const chainTable = ms->chainTable;
    const U32 chainSize = (1 << cParams->chainLog);
    const U32 chainMask = chainSize-1;
    const BYTE* const base = ms->window.base;
    const BYTE* const dictBase = ms->window.dictBase;
    const U32 dictLimit = ms->window.dictLimit;
    const BYTE* const prefixStart = base + dictLimit;
    const BYTE* const dictEnd = dictBase + dictLimit;
    const U32 curr = (U32)(ip-base);
    const U32 maxDistance = 1U << cParams->windowLog;
    const U32 lowestValid = ms->window.lowLimit;
    const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
    const U32 isDictionary = (ms->loadedDictEnd != 0);
    const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
    const U32 minChain = curr > chainSize ? curr - chainSize : 0;
    U32 nbAttempts = 1U << cParams->searchLog;
    size_t ml=4-1;

    const ZSTD_matchState_t* const dms = ms->dictMatchState;
    const U32 ddsHashLog = dictMode == ZSTD_dedicatedDictSearch
                         ? dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG : 0;
    const size_t ddsIdx = dictMode == ZSTD_dedicatedDictSearch
                        ? ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG : 0;

    U32 matchIndex;

    if (dictMode == ZSTD_dedicatedDictSearch) {
        const U32* entry = &dms->hashTable[ddsIdx];
        PREFETCH_L1(entry);
    }

    /* HC4 match finder */
    matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping);

    for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) {
        size_t currentMl=0;
        if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
            const BYTE* const match = base + matchIndex;
            assert(matchIndex >= dictLimit);   /* ensures this is true if dictMode != ZSTD_extDict */
            /* read 4B starting from (match + ml + 1 - sizeof(U32)) */
            if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3))   /* potentially better */
                currentMl = ZSTD_count(ip, match, iLimit);
        } else {
            const BYTE* const match = dictBase + matchIndex;
            assert(match+4 <= dictEnd);
            if (MEM_read32(match) == MEM_read32(ip))   /* assumption : matchIndex <= dictLimit-4 (by table construction) */
                currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;
        }

        /* save best solution */
        if (currentMl > ml) {
            ml = currentMl;
            *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
            if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
        }

        if (matchIndex <= minChain) break;
        matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask);
    }

    assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
    if (dictMode == ZSTD_dedicatedDictSearch) {
        ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts, dms,
                                                  ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
    } else if (dictMode == ZSTD_dictMatchState) {
        const U32* const dmsChainTable = dms->chainTable;
        const U32 dmsChainSize         = (1 << dms->cParams.chainLog);
        const U32 dmsChainMask         = dmsChainSize - 1;
        const U32 dmsLowestIndex       = dms->window.dictLimit;
        const BYTE* const dmsBase      = dms->window.base;
        const BYTE* const dmsEnd       = dms->window.nextSrc;
        const U32 dmsSize              = (U32)(dmsEnd - dmsBase);
        const U32 dmsIndexDelta        = dictLimit - dmsSize;
        const U32 dmsMinChain = dmsSize > dmsChainSize ? dmsSize - dmsChainSize : 0;

        matchIndex = dms->hashTable[ZSTD_hashPtr(ip, dms->cParams.hashLog, mls)];

        for ( ; (matchIndex>=dmsLowestIndex) & (nbAttempts>0) ; nbAttempts--) {
            size_t currentMl=0;
            const BYTE* const match = dmsBase + matchIndex;
            assert(match+4 <= dmsEnd);
            if (MEM_read32(match) == MEM_read32(ip))   /* assumption : matchIndex <= dictLimit-4 (by table construction) */
                currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4;

            /* save best solution */
            if (currentMl > ml) {
                ml = currentMl;
                assert(curr > matchIndex + dmsIndexDelta);
                *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
                if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
            }

            if (matchIndex <= dmsMinChain) break;

            matchIndex = dmsChainTable[matchIndex & dmsChainMask];
        }
    }

    return ml;
}

/* *********************************
* (SIMD) Row-based matchfinder
***********************************/
/* Constants for row-based hash */
#define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)
#define ZSTD_ROW_HASH_MAX_ENTRIES 64  /* absolute maximum number of entries per row, for all configurations */

#define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1)

typedef U64 ZSTD_VecMask;   /* Clarifies when we are interacting with a U64 representing a mask of matches */

/* ZSTD_VecMask_next():
 * Starting from the LSB, returns the idx of the next non-zero bit.
 * Basically counting the nb of trailing zeroes.
 */
MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
    return ZSTD_countTrailingZeros64(val);
}

/* ZSTD_row_nextIndex():
 * Returns the next index to insert at within a tagTable row, and updates the "head"
 * value to reflect the update. Essentially cycles backwards from [1, {entries per row})
 */
FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) {
    U32 next = (*tagRow-1) & rowMask;
    next += (next == 0) ? rowMask : 0; /* skip first position */
    *tagRow = (BYTE)next;
    return next;
}
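/* Illustrative note (editor's sketch, not part of the original file): byte 0 of each
 * tagTable row stores the row's "head" index, which is why position 0 is skipped above.
 * With rowMask == 15:
 *     *tagRow == 3  ->  next = 2                       (insertions walk backwards)
 *     *tagRow == 1  ->  next = (1-1)&15 = 0, bumped to 15  (wraps around, skipping slot 0)
 * The search side recovers the head as (*tagRow & rowMask); see ZSTD_RowFindBestMatch() below.
 */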
834 */ 835 FORCE_INLINE_TEMPLATE 836 ZSTD_ALLOW_POINTER_OVERFLOW_ATTR 837 void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base, 838 U32 const rowLog, U32 const mls, 839 U32 idx, const BYTE* const iLimit) 840 { 841 U32 const* const hashTable = ms->hashTable; 842 BYTE const* const tagTable = ms->tagTable; 843 U32 const hashLog = ms->rowHashLog; 844 U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1); 845 U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch); 846 847 for (; idx < lim; ++idx) { 848 U32 const hash = (U32)ZSTD_hashPtrSalted(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); 849 U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; 850 ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); 851 ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash; 852 } 853 854 DEBUGLOG(6, "ZSTD_row_fillHashCache(): [%u %u %u %u %u %u %u %u]", ms->hashCache[0], ms->hashCache[1], 855 ms->hashCache[2], ms->hashCache[3], ms->hashCache[4], 856 ms->hashCache[5], ms->hashCache[6], ms->hashCache[7]); 857 } 858 859 /* ZSTD_row_nextCachedHash(): 860 * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at 861 * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable. 862 */ 863 FORCE_INLINE_TEMPLATE 864 ZSTD_ALLOW_POINTER_OVERFLOW_ATTR 865 U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, 866 BYTE const* tagTable, BYTE const* base, 867 U32 idx, U32 const hashLog, 868 U32 const rowLog, U32 const mls, 869 U64 const hashSalt) 870 { 871 U32 const newHash = (U32)ZSTD_hashPtrSalted(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); 872 U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; 873 ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); 874 { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK]; 875 cache[idx & ZSTD_ROW_HASH_CACHE_MASK] = newHash; 876 return hash; 877 } 878 } 879 880 /* ZSTD_row_update_internalImpl(): 881 * Updates the hash table with positions starting from updateStartIdx until updateEndIdx. 882 */ 883 FORCE_INLINE_TEMPLATE 884 ZSTD_ALLOW_POINTER_OVERFLOW_ATTR 885 void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, 886 U32 updateStartIdx, U32 const updateEndIdx, 887 U32 const mls, U32 const rowLog, 888 U32 const rowMask, U32 const useCache) 889 { 890 U32* const hashTable = ms->hashTable; 891 BYTE* const tagTable = ms->tagTable; 892 U32 const hashLog = ms->rowHashLog; 893 const BYTE* const base = ms->window.base; 894 895 DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx); 896 for (; updateStartIdx < updateEndIdx; ++updateStartIdx) { 897 U32 const hash = useCache ? 
        U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls, ms->hashSalt)
                                  : (U32)ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt);
        U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
        U32* const row = hashTable + relRow;
        BYTE* tagRow = tagTable + relRow;
        U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);

        assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt));
        tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK;
        row[pos] = updateStartIdx;
    }
}

/* ZSTD_row_update_internal():
 * Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate.
 * Skips sections of long matches as is necessary.
 */
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip,
                              U32 const mls, U32 const rowLog,
                              U32 const rowMask, U32 const useCache)
{
    U32 idx = ms->nextToUpdate;
    const BYTE* const base = ms->window.base;
    const U32 target = (U32)(ip - base);
    const U32 kSkipThreshold = 384;
    const U32 kMaxMatchStartPositionsToUpdate = 96;
    const U32 kMaxMatchEndPositionsToUpdate = 32;

    if (useCache) {
        /* Only skip positions when using hash cache, i.e.
         * if we are loading a dict, don't skip anything.
         * If we decide to skip, then we only update a set number
         * of positions at the beginning and end of the match.
         */
        if (UNLIKELY(target - idx > kSkipThreshold)) {
            U32 const bound = idx + kMaxMatchStartPositionsToUpdate;
            ZSTD_row_update_internalImpl(ms, idx, bound, mls, rowLog, rowMask, useCache);
            idx = target - kMaxMatchEndPositionsToUpdate;
            ZSTD_row_fillHashCache(ms, base, rowLog, mls, idx, ip+1);
        }
    }
    assert(target >= idx);
    ZSTD_row_update_internalImpl(ms, idx, target, mls, rowLog, rowMask, useCache);
    ms->nextToUpdate = target;
}

/* ZSTD_row_update():
 * External wrapper for ZSTD_row_update_internal(). Used for filling the hashtable during dictionary
 * processing.
 */
void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
    const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
    const U32 rowMask = (1u << rowLog) - 1;
    const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);

    DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);
    ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */);
}

/* Returns the width, in bits, of the groups used in the match mask: each matching row
 * entry is reported as a group of this many bits, with the relevant bit set to 1.
 * Not all architectures have an easy movemask instruction, so this grouping makes it
 * easier and faster to iterate over the bits of the mask.
961 */ 962 FORCE_INLINE_TEMPLATE U32 963 ZSTD_row_matchMaskGroupWidth(const U32 rowEntries) 964 { 965 assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); 966 assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); 967 (void)rowEntries; 968 #if defined(ZSTD_ARCH_ARM_NEON) 969 /* NEON path only works for little endian */ 970 if (!MEM_isLittleEndian()) { 971 return 1; 972 } 973 if (rowEntries == 16) { 974 return 4; 975 } 976 if (rowEntries == 32) { 977 return 2; 978 } 979 if (rowEntries == 64) { 980 return 1; 981 } 982 #endif 983 return 1; 984 } 985 986 #if defined(ZSTD_ARCH_X86_SSE2) 987 FORCE_INLINE_TEMPLATE ZSTD_VecMask 988 ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U32 head) 989 { 990 const __m128i comparisonMask = _mm_set1_epi8((char)tag); 991 int matches[4] = {0}; 992 int i; 993 assert(nbChunks == 1 || nbChunks == 2 || nbChunks == 4); 994 for (i=0; i<nbChunks; i++) { 995 const __m128i chunk = _mm_loadu_si128((const __m128i*)(const void*)(src + 16*i)); 996 const __m128i equalMask = _mm_cmpeq_epi8(chunk, comparisonMask); 997 matches[i] = _mm_movemask_epi8(equalMask); 998 } 999 if (nbChunks == 1) return ZSTD_rotateRight_U16((U16)matches[0], head); 1000 if (nbChunks == 2) return ZSTD_rotateRight_U32((U32)matches[1] << 16 | (U32)matches[0], head); 1001 assert(nbChunks == 4); 1002 return ZSTD_rotateRight_U64((U64)matches[3] << 48 | (U64)matches[2] << 32 | (U64)matches[1] << 16 | (U64)matches[0], head); 1003 } 1004 #endif 1005 1006 #if defined(ZSTD_ARCH_ARM_NEON) 1007 FORCE_INLINE_TEMPLATE ZSTD_VecMask 1008 ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped) 1009 { 1010 assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); 1011 if (rowEntries == 16) { 1012 /* vshrn_n_u16 shifts by 4 every u16 and narrows to 8 lower bits. 1013 * After that groups of 4 bits represent the equalMask. We lower 1014 * all bits except the highest in these groups by doing AND with 1015 * 0x88 = 0b10001000. 1016 */ 1017 const uint8x16_t chunk = vld1q_u8(src); 1018 const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); 1019 const uint8x8_t res = vshrn_n_u16(equalMask, 4); 1020 const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0); 1021 return ZSTD_rotateRight_U64(matches, headGrouped) & 0x8888888888888888ull; 1022 } else if (rowEntries == 32) { 1023 /* Same idea as with rowEntries == 16 but doing AND with 1024 * 0x55 = 0b01010101. 
1025 */ 1026 const uint16x8x2_t chunk = vld2q_u16((const uint16_t*)(const void*)src); 1027 const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); 1028 const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); 1029 const uint8x16_t dup = vdupq_n_u8(tag); 1030 const uint8x8_t t0 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk0, dup)), 6); 1031 const uint8x8_t t1 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk1, dup)), 6); 1032 const uint8x8_t res = vsli_n_u8(t0, t1, 4); 1033 const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0) ; 1034 return ZSTD_rotateRight_U64(matches, headGrouped) & 0x5555555555555555ull; 1035 } else { /* rowEntries == 64 */ 1036 const uint8x16x4_t chunk = vld4q_u8(src); 1037 const uint8x16_t dup = vdupq_n_u8(tag); 1038 const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); 1039 const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); 1040 const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); 1041 const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); 1042 1043 const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); 1044 const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); 1045 const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); 1046 const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); 1047 const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); 1048 const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); 1049 return ZSTD_rotateRight_U64(matches, headGrouped); 1050 } 1051 } 1052 #endif 1053 1054 /* Returns a ZSTD_VecMask (U64) that has the nth group (determined by 1055 * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag" 1056 * matches the hash at the nth position in a row of the tagTable. 1057 * Each row is a circular buffer beginning at the value of "headGrouped". So we 1058 * must rotate the "matches" bitfield to match up with the actual layout of the 1059 * entries within the hashTable */ 1060 FORCE_INLINE_TEMPLATE ZSTD_VecMask 1061 ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries) 1062 { 1063 const BYTE* const src = tagRow; 1064 assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); 1065 assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); 1066 assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8); 1067 1068 #if defined(ZSTD_ARCH_X86_SSE2) 1069 1070 return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped); 1071 1072 #else /* SW or NEON-LE */ 1073 1074 # if defined(ZSTD_ARCH_ARM_NEON) 1075 /* This NEON path only works for little endian - otherwise use SWAR below */ 1076 if (MEM_isLittleEndian()) { 1077 return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped); 1078 } 1079 # endif /* ZSTD_ARCH_ARM_NEON */ 1080 /* SWAR */ 1081 { const int chunkSize = sizeof(size_t); 1082 const size_t shiftAmount = ((chunkSize * 8) - chunkSize); 1083 const size_t xFF = ~((size_t)0); 1084 const size_t x01 = xFF / 0xFF; 1085 const size_t x80 = x01 << 7; 1086 const size_t splatChar = tag * x01; 1087 ZSTD_VecMask matches = 0; 1088 int i = rowEntries - chunkSize; 1089 assert((sizeof(size_t) == 4) || (sizeof(size_t) == 8)); 1090 if (MEM_isLittleEndian()) { /* runtime check so have two loops */ 1091 const size_t extractMagic = (xFF / 0x7F) >> chunkSize; 1092 do { 1093 size_t chunk = MEM_readST(&src[i]); 1094 chunk ^= splatChar; 1095 chunk = (((chunk | x80) - x01) | chunk) & x80; 1096 matches <<= chunkSize; 1097 matches |= (chunk * extractMagic) >> shiftAmount; 1098 i -= chunkSize; 1099 } while (i >= 0); 1100 } else { /* big endian: reverse bits 
            const size_t msb = xFF ^ (xFF >> 1);
            const size_t extractMagic = (msb / 0x1FF) | msb;
            do {
                size_t chunk = MEM_readST(&src[i]);
                chunk ^= splatChar;
                chunk = (((chunk | x80) - x01) | chunk) & x80;
                matches <<= chunkSize;
                matches |= ((chunk >> 7) * extractMagic) >> shiftAmount;
                i -= chunkSize;
            } while (i >= 0);
        }
        matches = ~matches;
        if (rowEntries == 16) {
            return ZSTD_rotateRight_U16((U16)matches, headGrouped);
        } else if (rowEntries == 32) {
            return ZSTD_rotateRight_U32((U32)matches, headGrouped);
        } else {
            return ZSTD_rotateRight_U64((U64)matches, headGrouped);
        }
    }
#endif
}
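/* Illustrative note (editor's sketch, not part of the original file): the SWAR
 * expression above is a byte-wise "is zero" test. After chunk ^= splatChar, bytes
 * equal to the tag become 0x00. Then, for each byte b:
 *     (((b | 0x80) - 0x01) | b) & 0x80  ==  0x00 if b == 0x00, 0x80 otherwise
 * (OR-ing in 0x80 before the subtraction keeps borrows from crossing byte
 * boundaries). Multiplying by extractMagic gathers those per-byte 0x80 flags into
 * the top bits of the word, the shift by shiftAmount extracts one bit per byte,
 * and the final `matches = ~matches` flips them so matching (zeroed) bytes read as 1.
 */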

/* The high-level approach of the SIMD row based match finder is as follows:
 * - Figure out where to insert the new entry:
 *      - Generate a hash for current input position and split it into one byte of tag and `rowHashLog` bits of index.
 *           - The hash is salted by a value that changes on every context reset, so when the same table is used
 *             we will avoid collisions that would otherwise slow us down by introducing phantom matches.
 *      - The hashTable is effectively split into groups or "rows" of 15 or 31 entries of U32, and the index determines
 *        which row to insert into.
 *      - Determine the correct position within the row to insert the entry into. Each row of 15 or 31 can
 *        be considered as a circular buffer with a "head" index that resides in the tagTable (overall 16 or 32 bytes
 *        per row).
 * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte tag calculated for the position and
 *   generate a bitfield that we can cycle through to check the collisions in the hash table.
 * - Pick the longest match.
 * - Insert the tag into the equivalent row and position in the tagTable.
 */
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_RowFindBestMatch(
                        ZSTD_matchState_t* ms,
                        const BYTE* const ip, const BYTE* const iLimit,
                        size_t* offsetPtr,
                        const U32 mls, const ZSTD_dictMode_e dictMode,
                        const U32 rowLog)
{
    U32* const hashTable = ms->hashTable;
    BYTE* const tagTable = ms->tagTable;
    U32* const hashCache = ms->hashCache;
    const U32 hashLog = ms->rowHashLog;
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    const BYTE* const base = ms->window.base;
    const BYTE* const dictBase = ms->window.dictBase;
    const U32 dictLimit = ms->window.dictLimit;
    const BYTE* const prefixStart = base + dictLimit;
    const BYTE* const dictEnd = dictBase + dictLimit;
    const U32 curr = (U32)(ip-base);
    const U32 maxDistance = 1U << cParams->windowLog;
    const U32 lowestValid = ms->window.lowLimit;
    const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
    const U32 isDictionary = (ms->loadedDictEnd != 0);
    const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
    const U32 rowEntries = (1U << rowLog);
    const U32 rowMask = rowEntries - 1;
    const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */
    const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries);
    const U64 hashSalt = ms->hashSalt;
    U32 nbAttempts = 1U << cappedSearchLog;
    size_t ml=4-1;
    U32 hash;

    /* DMS/DDS variables that may be referenced later */
    const ZSTD_matchState_t* const dms = ms->dictMatchState;

    /* Initialize the following variables to satisfy static analyzer */
    size_t ddsIdx = 0;
    U32 ddsExtraAttempts = 0; /* cctx hash tables are limited in searches, but allow extra searches into DDS */
    U32 dmsTag = 0;
    U32* dmsRow = NULL;
    BYTE* dmsTagRow = NULL;

    if (dictMode == ZSTD_dedicatedDictSearch) {
        const U32 ddsHashLog = dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
        {   /* Prefetch DDS hashtable entry */
            ddsIdx = ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG;
            PREFETCH_L1(&dms->hashTable[ddsIdx]);
        }
        ddsExtraAttempts = cParams->searchLog > rowLog ? 1U << (cParams->searchLog - rowLog) : 0;
    }

    if (dictMode == ZSTD_dictMatchState) {
        /* Prefetch DMS rows */
        U32* const dmsHashTable = dms->hashTable;
        BYTE* const dmsTagTable = dms->tagTable;
        U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
        U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
        dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK;
        dmsTagRow = (BYTE*)(dmsTagTable + dmsRelRow);
        dmsRow = dmsHashTable + dmsRelRow;
        ZSTD_row_prefetch(dmsHashTable, dmsTagTable, dmsRelRow, rowLog);
    }

    /* Update the hashTable and tagTable up to (but not including) ip */
    if (!ms->lazySkipping) {
        ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
        hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt);
    } else {
        /* Stop inserting every position when in the lazy skipping mode.
         * The hash cache is also not kept up to date in this mode.
         */
1211 */ 1212 hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); 1213 ms->nextToUpdate = curr; 1214 } 1215 ms->hashSaltEntropy += hash; /* collect salt entropy */ 1216 1217 { /* Get the hash for ip, compute the appropriate row */ 1218 U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; 1219 U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK; 1220 U32* const row = hashTable + relRow; 1221 BYTE* tagRow = (BYTE*)(tagTable + relRow); 1222 U32 const headGrouped = (*tagRow & rowMask) * groupWidth; 1223 U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; 1224 size_t numMatches = 0; 1225 size_t currMatch = 0; 1226 ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries); 1227 1228 /* Cycle through the matches and prefetch */ 1229 for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { 1230 U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; 1231 U32 const matchIndex = row[matchPos]; 1232 if(matchPos == 0) continue; 1233 assert(numMatches < rowEntries); 1234 if (matchIndex < lowLimit) 1235 break; 1236 if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { 1237 PREFETCH_L1(base + matchIndex); 1238 } else { 1239 PREFETCH_L1(dictBase + matchIndex); 1240 } 1241 matchBuffer[numMatches++] = matchIndex; 1242 --nbAttempts; 1243 } 1244 1245 /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop 1246 in ZSTD_row_update_internal() at the next search. */ 1247 { 1248 U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); 1249 tagRow[pos] = (BYTE)tag; 1250 row[pos] = ms->nextToUpdate++; 1251 } 1252 1253 /* Return the longest match */ 1254 for (; currMatch < numMatches; ++currMatch) { 1255 U32 const matchIndex = matchBuffer[currMatch]; 1256 size_t currentMl=0; 1257 assert(matchIndex < curr); 1258 assert(matchIndex >= lowLimit); 1259 1260 if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { 1261 const BYTE* const match = base + matchIndex; 1262 assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ 1263 /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ 1264 if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ 1265 currentMl = ZSTD_count(ip, match, iLimit); 1266 } else { 1267 const BYTE* const match = dictBase + matchIndex; 1268 assert(match+4 <= dictEnd); 1269 if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */ 1270 currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4; 1271 } 1272 1273 /* Save best solution */ 1274 if (currentMl > ml) { 1275 ml = currentMl; 1276 *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); 1277 if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ 1278 } 1279 } 1280 } 1281 1282 assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. 
    if (dictMode == ZSTD_dedicatedDictSearch) {
        ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts + ddsExtraAttempts, dms,
                                                  ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
    } else if (dictMode == ZSTD_dictMatchState) {
        /* TODO: Measure and potentially add prefetching to DMS */
        const U32 dmsLowestIndex       = dms->window.dictLimit;
        const BYTE* const dmsBase      = dms->window.base;
        const BYTE* const dmsEnd       = dms->window.nextSrc;
        const U32 dmsSize              = (U32)(dmsEnd - dmsBase);
        const U32 dmsIndexDelta        = dictLimit - dmsSize;

        {   U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth;
            U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
            size_t numMatches = 0;
            size_t currMatch = 0;
            ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries);

            for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) {
                U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
                U32 const matchIndex = dmsRow[matchPos];
                if(matchPos == 0) continue;
                if (matchIndex < dmsLowestIndex)
                    break;
                PREFETCH_L1(dmsBase + matchIndex);
                matchBuffer[numMatches++] = matchIndex;
                --nbAttempts;
            }

            /* Return the longest match */
            for (; currMatch < numMatches; ++currMatch) {
                U32 const matchIndex = matchBuffer[currMatch];
                size_t currentMl=0;
                assert(matchIndex >= dmsLowestIndex);
                assert(matchIndex < curr);

                {   const BYTE* const match = dmsBase + matchIndex;
                    assert(match+4 <= dmsEnd);
                    if (MEM_read32(match) == MEM_read32(ip))
                        currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4;
                }

                if (currentMl > ml) {
                    ml = currentMl;
                    assert(curr > matchIndex + dmsIndexDelta);
                    *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
                    if (ip+currentMl == iLimit) break;
                }
            }
        }
    }
    return ml;
}


/**
 * Generate search functions templated on (dictMode, mls, rowLog).
 * These functions are outlined for code size & compilation time.
 * ZSTD_searchMax() dispatches to the correct implementation function.
 *
 * TODO: The start of the search function involves loading and calculating a
 * bunch of constants from the ZSTD_matchState_t. These computations could be
 * done in an initialization function, and saved somewhere in the match state.
 * Then we could pass a pointer to the saved state instead of the match state,
 * and avoid duplicate computations.
 *
 * TODO: Move the match re-winding into searchMax. This improves compression
 * ratio, and unlocks further simplifications with the next TODO.
 *
 * TODO: Try moving the repcode search into searchMax. After the re-winding
 * and repcode search are in searchMax, there is no more logic in the match
 * finder loop that requires knowledge about the dictMode. So we should be
 * able to avoid force inlining it, and we can join the extDict loop with
 * the single segment loop. It should go in searchMax instead of its own
 * function to avoid having multiple virtual function calls per search.
 */
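/* Illustrative note (editor's sketch, not part of the original file): for example,
 * GEN_ZSTD_ROW_SEARCH_FN(noDict, 4, 5) below expands to an outlined function
 *
 *     size_t ZSTD_RowFindBestMatch_noDict_4_5(ZSTD_matchState_t* ms,
 *                                             const BYTE* ip, const BYTE* const iLimit,
 *                                             size_t* offsetPtr)
 *
 * which simply forwards to ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict, 5).
 * The ZSTD_SWITCH_* macros further below select the right instance via switch statements.
 */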
1357 */ 1358 1359 #define ZSTD_BT_SEARCH_FN(dictMode, mls) ZSTD_BtFindBestMatch_##dictMode##_##mls 1360 #define ZSTD_HC_SEARCH_FN(dictMode, mls) ZSTD_HcFindBestMatch_##dictMode##_##mls 1361 #define ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog 1362 1363 #define ZSTD_SEARCH_FN_ATTRS FORCE_NOINLINE 1364 1365 #define GEN_ZSTD_BT_SEARCH_FN(dictMode, mls) \ 1366 ZSTD_SEARCH_FN_ATTRS size_t ZSTD_BT_SEARCH_FN(dictMode, mls)( \ 1367 ZSTD_matchState_t* ms, \ 1368 const BYTE* ip, const BYTE* const iLimit, \ 1369 size_t* offBasePtr) \ 1370 { \ 1371 assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \ 1372 return ZSTD_BtFindBestMatch(ms, ip, iLimit, offBasePtr, mls, ZSTD_##dictMode); \ 1373 } \ 1374 1375 #define GEN_ZSTD_HC_SEARCH_FN(dictMode, mls) \ 1376 ZSTD_SEARCH_FN_ATTRS size_t ZSTD_HC_SEARCH_FN(dictMode, mls)( \ 1377 ZSTD_matchState_t* ms, \ 1378 const BYTE* ip, const BYTE* const iLimit, \ 1379 size_t* offsetPtr) \ 1380 { \ 1381 assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \ 1382 return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \ 1383 } \ 1384 1385 #define GEN_ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) \ 1386 ZSTD_SEARCH_FN_ATTRS size_t ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)( \ 1387 ZSTD_matchState_t* ms, \ 1388 const BYTE* ip, const BYTE* const iLimit, \ 1389 size_t* offsetPtr) \ 1390 { \ 1391 assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \ 1392 assert(MAX(4, MIN(6, ms->cParams.searchLog)) == rowLog); \ 1393 return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode, rowLog); \ 1394 } \ 1395 1396 #define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \ 1397 X(dictMode, mls, 4) \ 1398 X(dictMode, mls, 5) \ 1399 X(dictMode, mls, 6) 1400 1401 #define ZSTD_FOR_EACH_MLS_ROWLOG(X, dictMode) \ 1402 ZSTD_FOR_EACH_ROWLOG(X, dictMode, 4) \ 1403 ZSTD_FOR_EACH_ROWLOG(X, dictMode, 5) \ 1404 ZSTD_FOR_EACH_ROWLOG(X, dictMode, 6) 1405 1406 #define ZSTD_FOR_EACH_MLS(X, dictMode) \ 1407 X(dictMode, 4) \ 1408 X(dictMode, 5) \ 1409 X(dictMode, 6) 1410 1411 #define ZSTD_FOR_EACH_DICT_MODE(X, ...) 
\ 1412 X(__VA_ARGS__, noDict) \ 1413 X(__VA_ARGS__, extDict) \ 1414 X(__VA_ARGS__, dictMatchState) \ 1415 X(__VA_ARGS__, dedicatedDictSearch) 1416 1417 /* Generate row search fns for each combination of (dictMode, mls, rowLog) */ 1418 ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_SEARCH_FN) 1419 /* Generate binary Tree search fns for each combination of (dictMode, mls) */ 1420 ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_SEARCH_FN) 1421 /* Generate hash chain search fns for each combination of (dictMode, mls) */ 1422 ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_SEARCH_FN) 1423 1424 typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e; 1425 1426 #define GEN_ZSTD_CALL_BT_SEARCH_FN(dictMode, mls) \ 1427 case mls: \ 1428 return ZSTD_BT_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr); 1429 #define GEN_ZSTD_CALL_HC_SEARCH_FN(dictMode, mls) \ 1430 case mls: \ 1431 return ZSTD_HC_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr); 1432 #define GEN_ZSTD_CALL_ROW_SEARCH_FN(dictMode, mls, rowLog) \ 1433 case rowLog: \ 1434 return ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)(ms, ip, iend, offsetPtr); 1435 1436 #define ZSTD_SWITCH_MLS(X, dictMode) \ 1437 switch (mls) { \ 1438 ZSTD_FOR_EACH_MLS(X, dictMode) \ 1439 } 1440 1441 #define ZSTD_SWITCH_ROWLOG(dictMode, mls) \ 1442 case mls: \ 1443 switch (rowLog) { \ 1444 ZSTD_FOR_EACH_ROWLOG(GEN_ZSTD_CALL_ROW_SEARCH_FN, dictMode, mls) \ 1445 } \ 1446 ZSTD_UNREACHABLE; \ 1447 break; 1448 1449 #define ZSTD_SWITCH_SEARCH_METHOD(dictMode) \ 1450 switch (searchMethod) { \ 1451 case search_hashChain: \ 1452 ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_HC_SEARCH_FN, dictMode) \ 1453 break; \ 1454 case search_binaryTree: \ 1455 ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_BT_SEARCH_FN, dictMode) \ 1456 break; \ 1457 case search_rowHash: \ 1458 ZSTD_SWITCH_MLS(ZSTD_SWITCH_ROWLOG, dictMode) \ 1459 break; \ 1460 } \ 1461 ZSTD_UNREACHABLE; 1462 1463 /** 1464 * Searches for the longest match at @p ip. 1465 * Dispatches to the correct implementation function based on the 1466 * (searchMethod, dictMode, mls, rowLog). We use switch statements 1467 * here instead of using an indirect function call through a function 1468 * pointer because after Spectre and Meltdown mitigations, indirect 1469 * function calls can be very costly, especially in the kernel. 1470 * 1471 * NOTE: dictMode and searchMethod should be templated, so those switch 1472 * statements should be optimized out. Only the mls & rowLog switches 1473 * should be left. 1474 * 1475 * @param ms The match state. 1476 * @param ip The position to search at. 1477 * @param iend The end of the input data. 1478 * @param[out] offsetPtr Stores the match offset into this pointer. 1479 * @param mls The minimum search length, in the range [4, 6]. 1480 * @param rowLog The row log (if applicable), in the range [4, 6]. 1481 * @param searchMethod The search method to use (templated). 1482 * @param dictMode The dictMode (templated). 1483 * 1484 * @returns The length of the longest match found, or < mls if no match is found. 1485 * If a match is found its offset is stored in @p offsetPtr. 
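 *
 * Note: the value written into @p offsetPtr is an offBase, i.e. the
 * OFFSET_TO_OFFBASE() encoding of the match distance expected by
 * ZSTD_storeSeq(), not the raw distance itself.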
1486 */ 1487 FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax( 1488 ZSTD_matchState_t* ms, 1489 const BYTE* ip, 1490 const BYTE* iend, 1491 size_t* offsetPtr, 1492 U32 const mls, 1493 U32 const rowLog, 1494 searchMethod_e const searchMethod, 1495 ZSTD_dictMode_e const dictMode) 1496 { 1497 if (dictMode == ZSTD_noDict) { 1498 ZSTD_SWITCH_SEARCH_METHOD(noDict) 1499 } else if (dictMode == ZSTD_extDict) { 1500 ZSTD_SWITCH_SEARCH_METHOD(extDict) 1501 } else if (dictMode == ZSTD_dictMatchState) { 1502 ZSTD_SWITCH_SEARCH_METHOD(dictMatchState) 1503 } else if (dictMode == ZSTD_dedicatedDictSearch) { 1504 ZSTD_SWITCH_SEARCH_METHOD(dedicatedDictSearch) 1505 } 1506 ZSTD_UNREACHABLE; 1507 return 0; 1508 } 1509 1510 /* ******************************* 1511 * Common parser - lazy strategy 1512 *********************************/ 1513 1514 FORCE_INLINE_TEMPLATE 1515 ZSTD_ALLOW_POINTER_OVERFLOW_ATTR 1516 size_t ZSTD_compressBlock_lazy_generic( 1517 ZSTD_matchState_t* ms, seqStore_t* seqStore, 1518 U32 rep[ZSTD_REP_NUM], 1519 const void* src, size_t srcSize, 1520 const searchMethod_e searchMethod, const U32 depth, 1521 ZSTD_dictMode_e const dictMode) 1522 { 1523 const BYTE* const istart = (const BYTE*)src; 1524 const BYTE* ip = istart; 1525 const BYTE* anchor = istart; 1526 const BYTE* const iend = istart + srcSize; 1527 const BYTE* const ilimit = (searchMethod == search_rowHash) ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8; 1528 const BYTE* const base = ms->window.base; 1529 const U32 prefixLowestIndex = ms->window.dictLimit; 1530 const BYTE* const prefixLowest = base + prefixLowestIndex; 1531 const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); 1532 const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); 1533 1534 U32 offset_1 = rep[0], offset_2 = rep[1]; 1535 U32 offsetSaved1 = 0, offsetSaved2 = 0; 1536 1537 const int isDMS = dictMode == ZSTD_dictMatchState; 1538 const int isDDS = dictMode == ZSTD_dedicatedDictSearch; 1539 const int isDxS = isDMS || isDDS; 1540 const ZSTD_matchState_t* const dms = ms->dictMatchState; 1541 const U32 dictLowestIndex = isDxS ? dms->window.dictLimit : 0; 1542 const BYTE* const dictBase = isDxS ? dms->window.base : NULL; 1543 const BYTE* const dictLowest = isDxS ? dictBase + dictLowestIndex : NULL; 1544 const BYTE* const dictEnd = isDxS ? dms->window.nextSrc : NULL; 1545 const U32 dictIndexDelta = isDxS ? 1546 prefixLowestIndex - (U32)(dictEnd - dictBase) : 1547 0; 1548 const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest)); 1549 1550 DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod); 1551 ip += (dictAndPrefixLength == 0); 1552 if (dictMode == ZSTD_noDict) { 1553 U32 const curr = (U32)(ip - base); 1554 U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog); 1555 U32 const maxRep = curr - windowLow; 1556 if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; 1557 if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; 1558 } 1559 if (isDxS) { 1560 /* dictMatchState repCode checks don't currently handle repCode == 0 1561 * disabling. 
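 * As a result, unlike the noDict path above, repcodes are never zeroed out
 * here; the asserts below check that the initial repcodes already fit within
 * the combined dictionary + prefix range.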
*/ 1562 assert(offset_1 <= dictAndPrefixLength); 1563 assert(offset_2 <= dictAndPrefixLength); 1564 } 1565 1566 /* Reset the lazy skipping state */ 1567 ms->lazySkipping = 0; 1568 1569 if (searchMethod == search_rowHash) { 1570 ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); 1571 } 1572 1573 /* Match Loop */ 1574 #if defined(__GNUC__) && defined(__x86_64__) 1575 /* I've measured random a 5% speed loss on levels 5 & 6 (greedy) when the 1576 * code alignment is perturbed. To fix the instability align the loop on 32-bytes. 1577 */ 1578 __asm__(".p2align 5"); 1579 #endif 1580 while (ip < ilimit) { 1581 size_t matchLength=0; 1582 size_t offBase = REPCODE1_TO_OFFBASE; 1583 const BYTE* start=ip+1; 1584 DEBUGLOG(7, "search baseline (depth 0)"); 1585 1586 /* check repCode */ 1587 if (isDxS) { 1588 const U32 repIndex = (U32)(ip - base) + 1 - offset_1; 1589 const BYTE* repMatch = ((dictMode == ZSTD_dictMatchState || dictMode == ZSTD_dedicatedDictSearch) 1590 && repIndex < prefixLowestIndex) ? 1591 dictBase + (repIndex - dictIndexDelta) : 1592 base + repIndex; 1593 if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) 1594 && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { 1595 const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; 1596 matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; 1597 if (depth==0) goto _storeSequence; 1598 } 1599 } 1600 if ( dictMode == ZSTD_noDict 1601 && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)))) { 1602 matchLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; 1603 if (depth==0) goto _storeSequence; 1604 } 1605 1606 /* first search (depth 0) */ 1607 { size_t offbaseFound = 999999999; 1608 size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode); 1609 if (ml2 > matchLength) 1610 matchLength = ml2, start = ip, offBase = offbaseFound; 1611 } 1612 1613 if (matchLength < 4) { 1614 size_t const step = ((size_t)(ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */; 1615 ip += step; 1616 /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. 1617 * In this mode we stop inserting every position into our tables, and only insert 1618 * positions that we search, which is one in step positions. 1619 * The exact cutoff is flexible, I've just chosen a number that is reasonably high, 1620 * so we minimize the compression ratio loss in "normal" scenarios. This mode gets 1621 * triggered once we've gone 2KB without finding any matches. 1622 */ 1623 ms->lazySkipping = step > kLazySkippingStep; 1624 continue; 1625 } 1626 1627 /* let's try to find a better solution */ 1628 if (depth>=1) 1629 while (ip<ilimit) { 1630 DEBUGLOG(7, "search depth 1"); 1631 ip ++; 1632 if ( (dictMode == ZSTD_noDict) 1633 && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { 1634 size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; 1635 int const gain2 = (int)(mlRep * 3); 1636 int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); 1637 if ((mlRep >= 4) && (gain2 > gain1)) 1638 matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; 1639 } 1640 if (isDxS) { 1641 const U32 repIndex = (U32)(ip - base) - offset_1; 1642 const BYTE* repMatch = repIndex < prefixLowestIndex ? 
1643 dictBase + (repIndex - dictIndexDelta) : 1644 base + repIndex; 1645 if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) 1646 && (MEM_read32(repMatch) == MEM_read32(ip)) ) { 1647 const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; 1648 size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; 1649 int const gain2 = (int)(mlRep * 3); 1650 int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); 1651 if ((mlRep >= 4) && (gain2 > gain1)) 1652 matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; 1653 } 1654 } 1655 { size_t ofbCandidate=999999999; 1656 size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); 1657 int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ 1658 int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); 1659 if ((ml2 >= 4) && (gain2 > gain1)) { 1660 matchLength = ml2, offBase = ofbCandidate, start = ip; 1661 continue; /* search a better one */ 1662 } } 1663 1664 /* let's find an even better one */ 1665 if ((depth==2) && (ip<ilimit)) { 1666 DEBUGLOG(7, "search depth 2"); 1667 ip ++; 1668 if ( (dictMode == ZSTD_noDict) 1669 && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { 1670 size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; 1671 int const gain2 = (int)(mlRep * 4); 1672 int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); 1673 if ((mlRep >= 4) && (gain2 > gain1)) 1674 matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; 1675 } 1676 if (isDxS) { 1677 const U32 repIndex = (U32)(ip - base) - offset_1; 1678 const BYTE* repMatch = repIndex < prefixLowestIndex ? 1679 dictBase + (repIndex - dictIndexDelta) : 1680 base + repIndex; 1681 if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) 1682 && (MEM_read32(repMatch) == MEM_read32(ip)) ) { 1683 const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; 1684 size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; 1685 int const gain2 = (int)(mlRep * 4); 1686 int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); 1687 if ((mlRep >= 4) && (gain2 > gain1)) 1688 matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; 1689 } 1690 } 1691 { size_t ofbCandidate=999999999; 1692 size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); 1693 int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ 1694 int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); 1695 if ((ml2 >= 4) && (gain2 > gain1)) { 1696 matchLength = ml2, offBase = ofbCandidate, start = ip; 1697 continue; 1698 } } } 1699 break; /* nothing found : store previous solution */ 1700 } 1701 1702 /* NOTE: 1703 * Pay attention that `start[-value]` can lead to strange undefined behavior 1704 * notably if `value` is unsigned, resulting in a large positive `-value`. 
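 * For example, with `size_t value = 1`, `start[-value]` evaluates as
 * `*(start + (size_t)-1)`, i.e. an offset of SIZE_MAX rather than -1.
 * The catch-up code below therefore only indexes with the literal -1 and
 * performs variable-distance adjustments as explicit pointer arithmetic.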
1705 */ 1706 /* catch up */ 1707 if (OFFBASE_IS_OFFSET(offBase)) { 1708 if (dictMode == ZSTD_noDict) { 1709 while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest)) 1710 && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) ) /* only search for offset within prefix */ 1711 { start--; matchLength++; } 1712 } 1713 if (isDxS) { 1714 U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); 1715 const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex; 1716 const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest; 1717 while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ 1718 } 1719 offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); 1720 } 1721 /* store sequence */ 1722 _storeSequence: 1723 { size_t const litLength = (size_t)(start - anchor); 1724 ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); 1725 anchor = ip = start + matchLength; 1726 } 1727 if (ms->lazySkipping) { 1728 /* We've found a match, disable lazy skipping mode, and refill the hash cache. */ 1729 if (searchMethod == search_rowHash) { 1730 ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); 1731 } 1732 ms->lazySkipping = 0; 1733 } 1734 1735 /* check immediate repcode */ 1736 if (isDxS) { 1737 while (ip <= ilimit) { 1738 U32 const current2 = (U32)(ip-base); 1739 U32 const repIndex = current2 - offset_2; 1740 const BYTE* repMatch = repIndex < prefixLowestIndex ? 1741 dictBase - dictIndexDelta + repIndex : 1742 base + repIndex; 1743 if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex) >= 3 /* intentional overflow */) 1744 && (MEM_read32(repMatch) == MEM_read32(ip)) ) { 1745 const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend; 1746 matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4; 1747 offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset_2 <=> offset_1 */ 1748 ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); 1749 ip += matchLength; 1750 anchor = ip; 1751 continue; 1752 } 1753 break; 1754 } 1755 } 1756 1757 if (dictMode == ZSTD_noDict) { 1758 while ( ((ip <= ilimit) & (offset_2>0)) 1759 && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) { 1760 /* store sequence */ 1761 matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; 1762 offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */ 1763 ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); 1764 ip += matchLength; 1765 anchor = ip; 1766 continue; /* faster when present ... (?) */ 1767 } } } 1768 1769 /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), 1770 * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ 1771 offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; 1772 1773 /* save reps for next block */ 1774 rep[0] = offset_1 ? offset_1 : offsetSaved1; 1775 rep[1] = offset_2 ? 
offset_2 : offsetSaved2; 1776 1777 /* Return the last literals size */ 1778 return (size_t)(iend - anchor); 1779 } 1780 #endif /* build exclusions */ 1781 1782 1783 #ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR 1784 size_t ZSTD_compressBlock_greedy( 1785 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1786 void const* src, size_t srcSize) 1787 { 1788 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict); 1789 } 1790 1791 size_t ZSTD_compressBlock_greedy_dictMatchState( 1792 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1793 void const* src, size_t srcSize) 1794 { 1795 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState); 1796 } 1797 1798 size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( 1799 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1800 void const* src, size_t srcSize) 1801 { 1802 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch); 1803 } 1804 1805 size_t ZSTD_compressBlock_greedy_row( 1806 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1807 void const* src, size_t srcSize) 1808 { 1809 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict); 1810 } 1811 1812 size_t ZSTD_compressBlock_greedy_dictMatchState_row( 1813 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1814 void const* src, size_t srcSize) 1815 { 1816 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState); 1817 } 1818 1819 size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( 1820 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1821 void const* src, size_t srcSize) 1822 { 1823 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch); 1824 } 1825 #endif 1826 1827 #ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR 1828 size_t ZSTD_compressBlock_lazy( 1829 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1830 void const* src, size_t srcSize) 1831 { 1832 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict); 1833 } 1834 1835 size_t ZSTD_compressBlock_lazy_dictMatchState( 1836 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1837 void const* src, size_t srcSize) 1838 { 1839 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState); 1840 } 1841 1842 size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( 1843 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1844 void const* src, size_t srcSize) 1845 { 1846 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch); 1847 } 1848 1849 size_t ZSTD_compressBlock_lazy_row( 1850 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1851 void const* src, size_t srcSize) 1852 { 1853 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict); 1854 } 1855 1856 size_t ZSTD_compressBlock_lazy_dictMatchState_row( 1857 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1858 void const* src, size_t srcSize) 1859 { 1860 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState); 1861 } 1862 
1863 size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( 1864 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1865 void const* src, size_t srcSize) 1866 { 1867 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch); 1868 } 1869 #endif 1870 1871 #ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR 1872 size_t ZSTD_compressBlock_lazy2( 1873 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1874 void const* src, size_t srcSize) 1875 { 1876 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict); 1877 } 1878 1879 size_t ZSTD_compressBlock_lazy2_dictMatchState( 1880 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1881 void const* src, size_t srcSize) 1882 { 1883 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState); 1884 } 1885 1886 size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( 1887 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1888 void const* src, size_t srcSize) 1889 { 1890 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch); 1891 } 1892 1893 size_t ZSTD_compressBlock_lazy2_row( 1894 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1895 void const* src, size_t srcSize) 1896 { 1897 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict); 1898 } 1899 1900 size_t ZSTD_compressBlock_lazy2_dictMatchState_row( 1901 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1902 void const* src, size_t srcSize) 1903 { 1904 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState); 1905 } 1906 1907 size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( 1908 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1909 void const* src, size_t srcSize) 1910 { 1911 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch); 1912 } 1913 #endif 1914 1915 #ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR 1916 size_t ZSTD_compressBlock_btlazy2( 1917 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1918 void const* src, size_t srcSize) 1919 { 1920 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict); 1921 } 1922 1923 size_t ZSTD_compressBlock_btlazy2_dictMatchState( 1924 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1925 void const* src, size_t srcSize) 1926 { 1927 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState); 1928 } 1929 #endif 1930 1931 #if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ 1932 || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ 1933 || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ 1934 || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) 1935 FORCE_INLINE_TEMPLATE 1936 ZSTD_ALLOW_POINTER_OVERFLOW_ATTR 1937 size_t ZSTD_compressBlock_lazy_extDict_generic( 1938 ZSTD_matchState_t* ms, seqStore_t* seqStore, 1939 U32 rep[ZSTD_REP_NUM], 1940 const void* src, size_t srcSize, 1941 const searchMethod_e searchMethod, const U32 depth) 1942 { 1943 const BYTE* const istart = (const BYTE*)src; 1944 const BYTE* ip = istart; 1945 const BYTE* anchor = istart; 1946 const BYTE* const iend = istart + srcSize; 1947 const BYTE* const ilimit = 
searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8; 1948 const BYTE* const base = ms->window.base; 1949 const U32 dictLimit = ms->window.dictLimit; 1950 const BYTE* const prefixStart = base + dictLimit; 1951 const BYTE* const dictBase = ms->window.dictBase; 1952 const BYTE* const dictEnd = dictBase + dictLimit; 1953 const BYTE* const dictStart = dictBase + ms->window.lowLimit; 1954 const U32 windowLog = ms->cParams.windowLog; 1955 const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); 1956 const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); 1957 1958 U32 offset_1 = rep[0], offset_2 = rep[1]; 1959 1960 DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod); 1961 1962 /* Reset the lazy skipping state */ 1963 ms->lazySkipping = 0; 1964 1965 /* init */ 1966 ip += (ip == prefixStart); 1967 if (searchMethod == search_rowHash) { 1968 ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); 1969 } 1970 1971 /* Match Loop */ 1972 #if defined(__GNUC__) && defined(__x86_64__) 1973 /* I've measured random a 5% speed loss on levels 5 & 6 (greedy) when the 1974 * code alignment is perturbed. To fix the instability align the loop on 32-bytes. 1975 */ 1976 __asm__(".p2align 5"); 1977 #endif 1978 while (ip < ilimit) { 1979 size_t matchLength=0; 1980 size_t offBase = REPCODE1_TO_OFFBASE; 1981 const BYTE* start=ip+1; 1982 U32 curr = (U32)(ip-base); 1983 1984 /* check repCode */ 1985 { const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr+1, windowLog); 1986 const U32 repIndex = (U32)(curr+1 - offset_1); 1987 const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; 1988 const BYTE* const repMatch = repBase + repIndex; 1989 if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow */ 1990 & (offset_1 <= curr+1 - windowLow) ) /* note: we are searching at curr+1 */ 1991 if (MEM_read32(ip+1) == MEM_read32(repMatch)) { 1992 /* repcode detected we should take it */ 1993 const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; 1994 matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repEnd, prefixStart) + 4; 1995 if (depth==0) goto _storeSequence; 1996 } } 1997 1998 /* first search (depth 0) */ 1999 { size_t ofbCandidate = 999999999; 2000 size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); 2001 if (ml2 > matchLength) 2002 matchLength = ml2, start = ip, offBase = ofbCandidate; 2003 } 2004 2005 if (matchLength < 4) { 2006 size_t const step = ((size_t)(ip-anchor) >> kSearchStrength); 2007 ip += step + 1; /* jump faster over incompressible sections */ 2008 /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. 2009 * In this mode we stop inserting every position into our tables, and only insert 2010 * positions that we search, which is one in step positions. 2011 * The exact cutoff is flexible, I've just chosen a number that is reasonably high, 2012 * so we minimize the compression ratio loss in "normal" scenarios. This mode gets 2013 * triggered once we've gone 2KB without finding any matches. 2014 */ 2015 ms->lazySkipping = step > kLazySkippingStep; 2016 continue; 2017 } 2018 2019 /* let's try to find a better solution */ 2020 if (depth>=1) 2021 while (ip<ilimit) { 2022 ip ++; 2023 curr++; 2024 /* check repCode */ 2025 if (offBase) { 2026 const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); 2027 const U32 repIndex = (U32)(curr - offset_1); 2028 const BYTE* const repBase = repIndex < dictLimit ? 
dictBase : base; 2029 const BYTE* const repMatch = repBase + repIndex; 2030 if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ 2031 & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ 2032 if (MEM_read32(ip) == MEM_read32(repMatch)) { 2033 /* repcode detected */ 2034 const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; 2035 size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; 2036 int const gain2 = (int)(repLength * 3); 2037 int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); 2038 if ((repLength >= 4) && (gain2 > gain1)) 2039 matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; 2040 } } 2041 2042 /* search match, depth 1 */ 2043 { size_t ofbCandidate = 999999999; 2044 size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); 2045 int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ 2046 int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); 2047 if ((ml2 >= 4) && (gain2 > gain1)) { 2048 matchLength = ml2, offBase = ofbCandidate, start = ip; 2049 continue; /* search a better one */ 2050 } } 2051 2052 /* let's find an even better one */ 2053 if ((depth==2) && (ip<ilimit)) { 2054 ip ++; 2055 curr++; 2056 /* check repCode */ 2057 if (offBase) { 2058 const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); 2059 const U32 repIndex = (U32)(curr - offset_1); 2060 const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; 2061 const BYTE* const repMatch = repBase + repIndex; 2062 if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ 2063 & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ 2064 if (MEM_read32(ip) == MEM_read32(repMatch)) { 2065 /* repcode detected */ 2066 const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; 2067 size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; 2068 int const gain2 = (int)(repLength * 4); 2069 int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); 2070 if ((repLength >= 4) && (gain2 > gain1)) 2071 matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; 2072 } } 2073 2074 /* search match, depth 2 */ 2075 { size_t ofbCandidate = 999999999; 2076 size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); 2077 int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ 2078 int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); 2079 if ((ml2 >= 4) && (gain2 > gain1)) { 2080 matchLength = ml2, offBase = ofbCandidate, start = ip; 2081 continue; 2082 } } } 2083 break; /* nothing found : store previous solution */ 2084 } 2085 2086 /* catch up */ 2087 if (OFFBASE_IS_OFFSET(offBase)) { 2088 U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); 2089 const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex; 2090 const BYTE* const mStart = (matchIndex < dictLimit) ? 
dictStart : prefixStart; 2091 while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ 2092 offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); 2093 } 2094 2095 /* store sequence */ 2096 _storeSequence: 2097 { size_t const litLength = (size_t)(start - anchor); 2098 ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); 2099 anchor = ip = start + matchLength; 2100 } 2101 if (ms->lazySkipping) { 2102 /* We've found a match, disable lazy skipping mode, and refill the hash cache. */ 2103 if (searchMethod == search_rowHash) { 2104 ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); 2105 } 2106 ms->lazySkipping = 0; 2107 } 2108 2109 /* check immediate repcode */ 2110 while (ip <= ilimit) { 2111 const U32 repCurrent = (U32)(ip-base); 2112 const U32 windowLow = ZSTD_getLowestMatchIndex(ms, repCurrent, windowLog); 2113 const U32 repIndex = repCurrent - offset_2; 2114 const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; 2115 const BYTE* const repMatch = repBase + repIndex; 2116 if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ 2117 & (offset_2 <= repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ 2118 if (MEM_read32(ip) == MEM_read32(repMatch)) { 2119 /* repcode detected we should take it */ 2120 const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; 2121 matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; 2122 offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset history */ 2123 ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); 2124 ip += matchLength; 2125 anchor = ip; 2126 continue; /* faster when present ... (?) 
*/ 2127 } 2128 break; 2129 } } 2130 2131 /* Save reps for next block */ 2132 rep[0] = offset_1; 2133 rep[1] = offset_2; 2134 2135 /* Return the last literals size */ 2136 return (size_t)(iend - anchor); 2137 } 2138 #endif /* build exclusions */ 2139 2140 #ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR 2141 size_t ZSTD_compressBlock_greedy_extDict( 2142 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 2143 void const* src, size_t srcSize) 2144 { 2145 return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0); 2146 } 2147 2148 size_t ZSTD_compressBlock_greedy_extDict_row( 2149 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 2150 void const* src, size_t srcSize) 2151 { 2152 return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0); 2153 } 2154 #endif 2155 2156 #ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR 2157 size_t ZSTD_compressBlock_lazy_extDict( 2158 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 2159 void const* src, size_t srcSize) 2160 2161 { 2162 return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1); 2163 } 2164 2165 size_t ZSTD_compressBlock_lazy_extDict_row( 2166 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 2167 void const* src, size_t srcSize) 2168 2169 { 2170 return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1); 2171 } 2172 #endif 2173 2174 #ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR 2175 size_t ZSTD_compressBlock_lazy2_extDict( 2176 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 2177 void const* src, size_t srcSize) 2178 2179 { 2180 return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2); 2181 } 2182 2183 size_t ZSTD_compressBlock_lazy2_extDict_row( 2184 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 2185 void const* src, size_t srcSize) 2186 { 2187 return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); 2188 } 2189 #endif 2190 2191 #ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR 2192 size_t ZSTD_compressBlock_btlazy2_extDict( 2193 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 2194 void const* src, size_t srcSize) 2195 2196 { 2197 return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2); 2198 } 2199 #endif 2200
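/* Reading aid: every block compressor defined above forwards to one of the two
 * generic parsers, parameterized as (searchMethod, depth, dictMode):
 *   - depth        : greedy -> 0, lazy -> 1, lazy2 and btlazy2 -> 2
 *   - searchMethod : btlazy2 -> search_binaryTree, "_row" variants -> search_rowHash,
 *                    all others -> search_hashChain
 *   - dictMode     : "_extDict" variants go through ZSTD_compressBlock_lazy_extDict_generic(),
 *                    the rest call ZSTD_compressBlock_lazy_generic() with noDict,
 *                    dictMatchState or dedicatedDictSearch as named.
 */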