/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */

#include "zstd_compress_internal.h"
#include "zstd_lazy.h"
#include "../common/bits.h" /* ZSTD_countTrailingZeros64 */

#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \
 || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \
 || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \
 || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR)

#define kLazySkippingStep 8


/*-*************************************
*  Binary Tree search
***************************************/

static
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
void ZSTD_updateDUBT(ZSTD_matchState_t* ms,
                const BYTE* ip, const BYTE* iend,
                U32 mls)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32* const hashTable = ms->hashTable;
    U32  const hashLog = cParams->hashLog;

    U32* const bt = ms->chainTable;
    U32  const btLog  = cParams->chainLog - 1;
    U32  const btMask = (1 << btLog) - 1;

    const BYTE* const base = ms->window.base;
    U32 const target = (U32)(ip - base);
    U32 idx = ms->nextToUpdate;

    if (idx != target)
        DEBUGLOG(7, "ZSTD_updateDUBT, from %u to %u (dictLimit:%u)",
                    idx, target, ms->window.dictLimit);
    assert(ip + 8 <= iend);   /* condition for ZSTD_hashPtr */
    (void)iend;

    assert(idx >= ms->window.dictLimit);   /* condition for valid base+idx */
    for ( ; idx < target ; idx++) {
        size_t const h = ZSTD_hashPtr(base + idx, hashLog, mls);   /* assumption : ip + 8 <= iend */
        U32    const matchIndex = hashTable[h];

        U32* const nextCandidatePtr = bt + 2*(idx&btMask);
        U32* const sortMarkPtr = nextCandidatePtr + 1;

        DEBUGLOG(8, "ZSTD_updateDUBT: insert %u", idx);
        hashTable[h] = idx;   /* Update Hash Table */
        *nextCandidatePtr = matchIndex;   /* update BT like a chain */
        *sortMarkPtr = ZSTD_DUBT_UNSORTED_MARK;
    }
    ms->nextToUpdate = target;
}


/** ZSTD_insertDUBT1() :
 *  sort one already inserted but unsorted position
 *  assumption : curr >= btlow == (curr - btmask)
 *  doesn't fail */
static
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
void ZSTD_insertDUBT1(const ZSTD_matchState_t* ms,
                 U32 curr, const BYTE* inputEnd,
                 U32 nbCompares, U32 btLow,
                 const ZSTD_dictMode_e dictMode)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32* const bt = ms->chainTable;
    U32  const btLog  = cParams->chainLog - 1;
    U32  const btMask = (1 << btLog) - 1;
    size_t commonLengthSmaller=0, commonLengthLarger=0;
    const BYTE* const base = ms->window.base;
    const BYTE* const dictBase = ms->window.dictBase;
    const U32 dictLimit = ms->window.dictLimit;
    const BYTE* const ip = (curr>=dictLimit) ? base + curr : dictBase + curr;
    const BYTE* const iend = (curr>=dictLimit) ? inputEnd : dictBase + dictLimit;
    const BYTE* const dictEnd = dictBase + dictLimit;
    const BYTE* const prefixStart = base + dictLimit;
    const BYTE* match;
    U32* smallerPtr = bt + 2*(curr&btMask);
    U32* largerPtr  = smallerPtr + 1;
    U32 matchIndex = *smallerPtr;   /* this candidate is unsorted : next sorted candidate is reached through *smallerPtr, while *largerPtr contains previous unsorted candidate (which is already saved and can be overwritten) */
    U32 dummy32;   /* to be nullified at the end */
    U32 const windowValid = ms->window.lowLimit;
    U32 const maxDistance = 1U << cParams->windowLog;
    U32 const windowLow = (curr - windowValid > maxDistance) ? curr - maxDistance : windowValid;


    DEBUGLOG(8, "ZSTD_insertDUBT1(%u) (dictLimit=%u, lowLimit=%u)",
                curr, dictLimit, windowLow);
    assert(curr >= btLow);
    assert(ip < iend);   /* condition for ZSTD_count */

    for (; nbCompares && (matchIndex > windowLow); --nbCompares) {
        U32* const nextPtr = bt + 2*(matchIndex & btMask);
        size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
        assert(matchIndex < curr);
        /* note : all candidates are now supposed sorted,
         * but it's still possible to have nextPtr[1] == ZSTD_DUBT_UNSORTED_MARK
         * when a real index has the same value as ZSTD_DUBT_UNSORTED_MARK */

        if ( (dictMode != ZSTD_extDict)
          || (matchIndex+matchLength >= dictLimit)  /* both in current segment*/
          || (curr < dictLimit) /* both in extDict */) {
            const BYTE* const mBase = ( (dictMode != ZSTD_extDict)
                                     || (matchIndex+matchLength >= dictLimit)) ?
                                        base : dictBase;
            assert( (matchIndex+matchLength >= dictLimit)   /* might be wrong if extDict is incorrectly set to 0 */
                 || (curr < dictLimit) );
            match = mBase + matchIndex;
            matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
        } else {
            match = dictBase + matchIndex;
            matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
            if (matchIndex+matchLength >= dictLimit)
                match = base + matchIndex;   /* preparation for next read of match[matchLength] */
        }

        DEBUGLOG(8, "ZSTD_insertDUBT1: comparing %u with %u : found %u common bytes ",
                    curr, matchIndex, (U32)matchLength);

        if (ip+matchLength == iend) {   /* equal : no way to know if inf or sup */
            break;   /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt tree */
        }

        if (match[matchLength] < ip[matchLength]) {  /* necessarily within buffer */
            /* match is smaller than current */
            *smallerPtr = matchIndex;             /* update smaller idx */
            commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
            if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop searching */
            DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is smaller : next => %u",
                        matchIndex, btLow, nextPtr[1]);
            smallerPtr = nextPtr+1;               /* new "candidate" => larger than match, which was smaller than target */
            matchIndex = nextPtr[1];              /* new matchIndex, larger than previous and closer to current */
        } else {
            /* match is larger than current */
            *largerPtr = matchIndex;
            commonLengthLarger = matchLength;
            if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop searching */
            DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is larger => %u",
                        matchIndex, btLow, nextPtr[0]);
            largerPtr = nextPtr;
            matchIndex = nextPtr[0];
    }   }

    *smallerPtr = *largerPtr = 0;
}


static
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_DUBT_findBetterDictMatch (
        const ZSTD_matchState_t* ms,
        const BYTE* const ip, const BYTE* const iend,
        size_t* offsetPtr,
        size_t bestLength,
        U32 nbCompares,
        U32 const mls,
        const ZSTD_dictMode_e dictMode)
{
    const ZSTD_matchState_t * const dms = ms->dictMatchState;
    const ZSTD_compressionParameters* const dmsCParams = &dms->cParams;
    const U32 * const dictHashTable = dms->hashTable;
    U32         const hashLog = dmsCParams->hashLog;
    size_t      const h  = ZSTD_hashPtr(ip, hashLog, mls);
    U32               dictMatchIndex = dictHashTable[h];

    const BYTE* const base = ms->window.base;
    const BYTE* const prefixStart = base + ms->window.dictLimit;
    U32         const curr = (U32)(ip-base);
    const BYTE* const dictBase = dms->window.base;
    const BYTE* const dictEnd = dms->window.nextSrc;
    U32         const dictHighLimit = (U32)(dms->window.nextSrc - dms->window.base);
    U32         const dictLowLimit = dms->window.lowLimit;
    U32         const dictIndexDelta = ms->window.lowLimit - dictHighLimit;

    U32*        const dictBt = dms->chainTable;
    U32         const btLog  = dmsCParams->chainLog - 1;
    U32         const btMask = (1 << btLog) - 1;
    U32         const btLow = (btMask >= dictHighLimit - dictLowLimit) ? dictLowLimit : dictHighLimit - btMask;

    size_t commonLengthSmaller=0, commonLengthLarger=0;

    (void)dictMode;
    assert(dictMode == ZSTD_dictMatchState);

    for (; nbCompares && (dictMatchIndex > dictLowLimit); --nbCompares) {
        U32* const nextPtr = dictBt + 2*(dictMatchIndex & btMask);
        size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
        const BYTE* match = dictBase + dictMatchIndex;
        matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
        if (dictMatchIndex+matchLength >= dictHighLimit)
            match = base + dictMatchIndex + dictIndexDelta;   /* to prepare for next usage of match[matchLength] */

        if (matchLength > bestLength) {
            U32 matchIndex = dictMatchIndex + dictIndexDelta;
            if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
                DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
                    curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex);
                bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
            }
            if (ip+matchLength == iend) {   /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
                break;   /* drop, to guarantee consistency (miss a little bit of compression) */
            }
        }

        if (match[matchLength] < ip[matchLength]) {
            if (dictMatchIndex <= btLow) { break; }   /* beyond tree size, stop the search */
            commonLengthSmaller = matchLength;   /* all smaller will now have at least this guaranteed common length */
            dictMatchIndex = nextPtr[1];         /* new matchIndex larger than previous (closer to current) */
        } else {
            /* match is larger than current */
            if (dictMatchIndex <= btLow) { break; }   /* beyond tree size, stop the search */
            commonLengthLarger = matchLength;
            dictMatchIndex = nextPtr[0];
        }
    }

    if (bestLength >= MINMATCH) {
        U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex;
        DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
                    curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
    }
    return bestLength;

}


static
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
                        const BYTE* const ip, const BYTE* const iend,
                        size_t* offBasePtr,
                        U32 const mls,
                        const ZSTD_dictMode_e dictMode)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32*   const hashTable = ms->hashTable;
    U32    const hashLog = cParams->hashLog;
    size_t const h  = ZSTD_hashPtr(ip, hashLog, mls);
    U32          matchIndex  = hashTable[h];

    const BYTE* const base = ms->window.base;
    U32    const curr = (U32)(ip-base);
    U32    const windowLow = ZSTD_getLowestMatchIndex(ms, curr, cParams->windowLog);

    U32*   const bt = ms->chainTable;
    U32    const btLog  = cParams->chainLog - 1;
    U32    const btMask = (1 << btLog) - 1;
    U32    const btLow = (btMask >= curr) ? 0 : curr - btMask;
    U32    const unsortLimit = MAX(btLow, windowLow);

    U32*   nextCandidate = bt + 2*(matchIndex&btMask);
    U32*   unsortedMark = bt + 2*(matchIndex&btMask) + 1;
    U32    nbCompares = 1U << cParams->searchLog;
    U32    nbCandidates = nbCompares;
    U32    previousCandidate = 0;

    DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", curr);
    assert(ip <= iend-8);   /* required for h calculation */
    assert(dictMode != ZSTD_dedicatedDictSearch);

    /* reach end of unsorted candidates list */
    while ( (matchIndex > unsortLimit)
         && (*unsortedMark == ZSTD_DUBT_UNSORTED_MARK)
         && (nbCandidates > 1) ) {
        DEBUGLOG(8, "ZSTD_DUBT_findBestMatch: candidate %u is unsorted",
                    matchIndex);
        *unsortedMark = previousCandidate;  /* the unsortedMark becomes a reversed chain, to move up back to original position */
        previousCandidate = matchIndex;
        matchIndex = *nextCandidate;
        nextCandidate = bt + 2*(matchIndex&btMask);
        unsortedMark = bt + 2*(matchIndex&btMask) + 1;
        nbCandidates --;
    }

    /* nullify last candidate if it's still unsorted
     * simplification, detrimental to compression ratio, beneficial for speed */
    if ( (matchIndex > unsortLimit)
      && (*unsortedMark==ZSTD_DUBT_UNSORTED_MARK) ) {
        DEBUGLOG(7, "ZSTD_DUBT_findBestMatch: nullify last unsorted candidate %u",
                    matchIndex);
        *nextCandidate = *unsortedMark = 0;
    }

    /* batch sort stacked candidates */
    matchIndex = previousCandidate;
    while (matchIndex) {  /* will end on matchIndex == 0 */
        U32* const nextCandidateIdxPtr = bt + 2*(matchIndex&btMask) + 1;
        U32 const nextCandidateIdx = *nextCandidateIdxPtr;
        ZSTD_insertDUBT1(ms, matchIndex, iend,
                         nbCandidates, unsortLimit, dictMode);
        matchIndex = nextCandidateIdx;
        nbCandidates++;
    }

    /* find longest match */
    {   size_t commonLengthSmaller = 0, commonLengthLarger = 0;
        const BYTE* const dictBase = ms->window.dictBase;
        const U32 dictLimit = ms->window.dictLimit;
        const BYTE* const dictEnd = dictBase + dictLimit;
        const BYTE* const prefixStart = base + dictLimit;
        U32* smallerPtr = bt + 2*(curr&btMask);
        U32* largerPtr  = bt + 2*(curr&btMask) + 1;
        U32 matchEndIdx = curr + 8 + 1;
        U32 dummy32;   /* to be nullified at the end */
        size_t bestLength = 0;

        matchIndex  = hashTable[h];
        hashTable[h] = curr;   /* Update Hash Table */

        for (; nbCompares && (matchIndex > windowLow); --nbCompares) {
            U32* const nextPtr = bt + 2*(matchIndex & btMask);
            size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
            const BYTE* match;

            if ((dictMode != ZSTD_extDict) || (matchIndex+matchLength >= dictLimit)) {
                match = base + matchIndex;
                matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
            } else {
                match = dictBase + matchIndex;
                matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
                if (matchIndex+matchLength >= dictLimit)
                    match = base + matchIndex;   /* to prepare for next usage of match[matchLength] */
            }

            if (matchLength > bestLength) {
                if (matchLength > matchEndIdx - matchIndex)
                    matchEndIdx = matchIndex + (U32)matchLength;
                if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) )
                    bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex);
                if (ip+matchLength == iend) {   /* equal : no way to know if inf or sup */
                    if (dictMode == ZSTD_dictMatchState) {
                        nbCompares = 0; /* in addition to avoiding checking any
                                         * further in this loop, make sure we
                                         * skip checking in the dictionary. */
                    }
                    break;   /* drop, to guarantee consistency (miss a little bit of compression) */
                }
            }

            if (match[matchLength] < ip[matchLength]) {
                /* match is smaller than current */
                *smallerPtr = matchIndex;             /* update smaller idx */
                commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
                if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
                smallerPtr = nextPtr+1;               /* new "smaller" => larger of match */
                matchIndex = nextPtr[1];              /* new matchIndex larger than previous (closer to current) */
            } else {
                /* match is larger than current */
                *largerPtr = matchIndex;
                commonLengthLarger = matchLength;
                if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
                largerPtr = nextPtr;
                matchIndex = nextPtr[0];
        }   }

        *smallerPtr = *largerPtr = 0;

        assert(nbCompares <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
        if (dictMode == ZSTD_dictMatchState && nbCompares) {
            bestLength = ZSTD_DUBT_findBetterDictMatch(
                    ms, ip, iend,
                    offBasePtr, bestLength, nbCompares,
                    mls, dictMode);
        }

        assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */
        ms->nextToUpdate = matchEndIdx - 8;   /* skip repetitive patterns */
        if (bestLength >= MINMATCH) {
            U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex;
            DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
                        curr, (U32)bestLength, (U32)*offBasePtr, mIndex);
        }
        return bestLength;
    }
}


/** ZSTD_BtFindBestMatch() : Tree updater, providing best match */
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
                const BYTE* const ip, const BYTE* const iLimit,
                      size_t* offBasePtr,
                const U32 mls /* template */,
                const ZSTD_dictMode_e dictMode)
{
    DEBUGLOG(7, "ZSTD_BtFindBestMatch");
    if (ip < ms->window.base + ms->nextToUpdate) return 0;   /* skipped area */
    ZSTD_updateDUBT(ms, ip, iLimit, mls);
    return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode);
}

/***********************************
* Dedicated dict search
***********************************/

void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip)
{
    const BYTE* const base = ms->window.base;
    U32 const target = (U32)(ip - base);
    U32* const hashTable = ms->hashTable;
    U32* const chainTable = ms->chainTable;
    U32 const chainSize = 1 << ms->cParams.chainLog;
    U32 idx = ms->nextToUpdate;
    U32 const minChain = chainSize < target - idx ? target - chainSize : idx;
    U32 const bucketSize = 1 << ZSTD_LAZY_DDSS_BUCKET_LOG;
    U32 const cacheSize = bucketSize - 1;
    U32 const chainAttempts = (1 << ms->cParams.searchLog) - cacheSize;
    U32 const chainLimit = chainAttempts > 255 ? 255 : chainAttempts;

    /* We know the hashtable is oversized by a factor of `bucketSize`.
     * We are going to temporarily pretend `bucketSize == 1`, keeping only a
     * single entry. We will use the rest of the space to construct a temporary
     * chaintable.
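     *
     * (Illustrative reading of the layout, derived from the tmpChainTable /
     * tmpChainSize arithmetic below: if ZSTD_LAZY_DDSS_BUCKET_LOG == 2, each
     * bucket normally holds 4 entries, so the first (1 << hashLog) slots act
     * as a 1-entry-per-bucket hash table while the remaining 3 << hashLog
     * slots are reused as the temporary chain table.)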
     */
    U32 const hashLog = ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
    U32* const tmpHashTable = hashTable;
    U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog);
    U32 const tmpChainSize = (U32)((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog;
    U32 const tmpMinChain = tmpChainSize < target ? target - tmpChainSize : idx;
    U32 hashIdx;

    assert(ms->cParams.chainLog <= 24);
    assert(ms->cParams.hashLog > ms->cParams.chainLog);
    assert(idx != 0);
    assert(tmpMinChain <= minChain);

    /* fill conventional hash table and conventional chain table */
    for ( ; idx < target; idx++) {
        U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch);
        if (idx >= tmpMinChain) {
            tmpChainTable[idx - tmpMinChain] = hashTable[h];
        }
        tmpHashTable[h] = idx;
    }

    /* sort chains into ddss chain table */
    {
        U32 chainPos = 0;
        for (hashIdx = 0; hashIdx < (1U << hashLog); hashIdx++) {
            U32 count;
            U32 countBeyondMinChain = 0;
            U32 i = tmpHashTable[hashIdx];
            for (count = 0; i >= tmpMinChain && count < cacheSize; count++) {
                /* skip through the chain to the first position that won't be
                 * in the hash cache bucket */
                if (i < minChain) {
                    countBeyondMinChain++;
                }
                i = tmpChainTable[i - tmpMinChain];
            }
            if (count == cacheSize) {
                for (count = 0; count < chainLimit;) {
                    if (i < minChain) {
                        if (!i || ++countBeyondMinChain > cacheSize) {
                            /* only allow pulling `cacheSize` number of entries
                             * into the cache or chainTable beyond `minChain`,
                             * to replace the entries pulled out of the
                             * chainTable into the cache. This lets us reach
                             * back further without increasing the total number
                             * of entries in the chainTable, guaranteeing the
                             * DDSS chain table will fit into the space
                             * allocated for the regular one. */
                            break;
                        }
                    }
                    chainTable[chainPos++] = i;
                    count++;
                    if (i < tmpMinChain) {
                        break;
                    }
                    i = tmpChainTable[i - tmpMinChain];
                }
            } else {
                count = 0;
            }
            if (count) {
                tmpHashTable[hashIdx] = ((chainPos - count) << 8) + count;
            } else {
                tmpHashTable[hashIdx] = 0;
            }
        }
        assert(chainPos <= chainSize); /* I believe this is guaranteed... */
    }

    /* move chain pointers into the last entry of each hash bucket */
    for (hashIdx = (1 << hashLog); hashIdx; ) {
        U32 const bucketIdx = --hashIdx << ZSTD_LAZY_DDSS_BUCKET_LOG;
        U32 const chainPackedPointer = tmpHashTable[hashIdx];
        U32 i;
        for (i = 0; i < cacheSize; i++) {
            hashTable[bucketIdx + i] = 0;
        }
        hashTable[bucketIdx + bucketSize - 1] = chainPackedPointer;
    }

    /* fill the buckets of the hash table */
    for (idx = ms->nextToUpdate; idx < target; idx++) {
        U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch)
                   << ZSTD_LAZY_DDSS_BUCKET_LOG;
        U32 i;
        /* Shift hash cache down 1. */
        for (i = cacheSize - 1; i; i--)
            hashTable[h + i] = hashTable[h + i - 1];
        hashTable[h] = idx;
    }

    ms->nextToUpdate = target;
}

/* Returns the longest match length found in the dedicated dict search structure.
 * If none are longer than the argument ml, then ml will be returned.
 */
FORCE_INLINE_TEMPLATE
size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nbAttempts,
                                            const ZSTD_matchState_t* const dms,
                                            const BYTE* const ip, const BYTE* const iLimit,
                                            const BYTE* const prefixStart, const U32 curr,
                                            const U32 dictLimit, const size_t ddsIdx) {
    const U32 ddsLowestIndex  = dms->window.dictLimit;
    const BYTE* const ddsBase = dms->window.base;
    const BYTE* const ddsEnd  = dms->window.nextSrc;
    const U32 ddsSize         = (U32)(ddsEnd - ddsBase);
    const U32 ddsIndexDelta   = dictLimit - ddsSize;
    const U32 bucketSize      = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG);
    const U32 bucketLimit     = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1;
    U32 ddsAttempt;
    U32 matchIndex;

    for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) {
        PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]);
    }

    {
        U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
        U32 const chainIndex = chainPackedPointer >> 8;

        PREFETCH_L1(&dms->chainTable[chainIndex]);
    }

    for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) {
        size_t currentMl=0;
        const BYTE* match;
        matchIndex = dms->hashTable[ddsIdx + ddsAttempt];
        match = ddsBase + matchIndex;

        if (!matchIndex) {
            return ml;
        }

        /* guaranteed by table construction */
        (void)ddsLowestIndex;
        assert(matchIndex >= ddsLowestIndex);
        assert(match+4 <= ddsEnd);
        if (MEM_read32(match) == MEM_read32(ip)) {
            /* assumption : matchIndex <= dictLimit-4 (by table construction) */
            currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
        }

        /* save best solution */
        if (currentMl > ml) {
            ml = currentMl;
            *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
            if (ip+currentMl == iLimit) {
                /* best possible, avoids read overflow on next attempt */
                return ml;
            }
        }
    }

    {
        U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
        U32 chainIndex = chainPackedPointer >> 8;
        U32 const chainLength = chainPackedPointer & 0xFF;
        U32 const chainAttempts = nbAttempts - ddsAttempt;
        U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts;
        U32 chainAttempt;

        for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) {
            PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]);
        }

        for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) {
            size_t currentMl=0;
            const BYTE* match;
            matchIndex = dms->chainTable[chainIndex];
            match = ddsBase + matchIndex;

            /* guaranteed by table construction */
            assert(matchIndex >= ddsLowestIndex);
            assert(match+4 <= ddsEnd);
            if (MEM_read32(match) == MEM_read32(ip)) {
                /* assumption : matchIndex <= dictLimit-4 (by table construction) */
                currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
            }

            /* save best solution */
            if (currentMl > ml) {
                ml = currentMl;
                *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
                if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
            }
        }
    }
    return ml;
}


/* *********************************
*  Hash Chain
***********************************/
#define NEXT_IN_CHAIN(d, mask)   chainTable[(d) & (mask)]

/* Update chains up to ip (excluded)
   Assumption : always within prefix (i.e. not within extDict) */
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
U32 ZSTD_insertAndFindFirstIndex_internal(
                        ZSTD_matchState_t* ms,
                        const ZSTD_compressionParameters* const cParams,
                        const BYTE* ip, U32 const mls, U32 const lazySkipping)
{
    U32* const hashTable  = ms->hashTable;
    const U32 hashLog = cParams->hashLog;
    U32* const chainTable = ms->chainTable;
    const U32 chainMask = (1 << cParams->chainLog) - 1;
    const BYTE* const base = ms->window.base;
    const U32 target = (U32)(ip - base);
    U32 idx = ms->nextToUpdate;

    while(idx < target) { /* catch up */
        size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls);
        NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
        hashTable[h] = idx;
        idx++;
        /* Stop inserting every position when in the lazy skipping mode. */
        if (lazySkipping)
            break;
    }

    ms->nextToUpdate = target;
    return hashTable[ZSTD_hashPtr(ip, hashLog, mls)];
}

U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0);
}

/* inlining is important to hardwire a hot branch (template emulation) */
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_HcFindBestMatch(
                        ZSTD_matchState_t* ms,
                        const BYTE* const ip, const BYTE* const iLimit,
                        size_t* offsetPtr,
                        const U32 mls, const ZSTD_dictMode_e dictMode)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32* const chainTable = ms->chainTable;
    const U32 chainSize = (1 << cParams->chainLog);
    const U32 chainMask = chainSize-1;
    const BYTE* const base = ms->window.base;
    const BYTE* const dictBase = ms->window.dictBase;
    const U32 dictLimit = ms->window.dictLimit;
    const BYTE* const prefixStart = base + dictLimit;
    const BYTE* const dictEnd = dictBase + dictLimit;
    const U32 curr = (U32)(ip-base);
    const U32 maxDistance = 1U << cParams->windowLog;
    const U32 lowestValid = ms->window.lowLimit;
    const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
    const U32 isDictionary = (ms->loadedDictEnd != 0);
    const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
    const U32 minChain = curr > chainSize ? curr - chainSize : 0;
    U32 nbAttempts = 1U << cParams->searchLog;
    size_t ml=4-1;

    const ZSTD_matchState_t* const dms = ms->dictMatchState;
    const U32 ddsHashLog = dictMode == ZSTD_dedicatedDictSearch
                         ? dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG : 0;
    const size_t ddsIdx = dictMode == ZSTD_dedicatedDictSearch
                        ? ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG : 0;

    U32 matchIndex;

    if (dictMode == ZSTD_dedicatedDictSearch) {
        const U32* entry = &dms->hashTable[ddsIdx];
        PREFETCH_L1(entry);
    }

    /* HC4 match finder */
    matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping);

    for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) {
        size_t currentMl=0;
        if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
            const BYTE* const match = base + matchIndex;
            assert(matchIndex >= dictLimit);   /* ensures this is true if dictMode != ZSTD_extDict */
            /* read 4B starting from (match + ml + 1 - sizeof(U32)) */
            if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3))   /* potentially better */
                currentMl = ZSTD_count(ip, match, iLimit);
        } else {
            const BYTE* const match = dictBase + matchIndex;
            assert(match+4 <= dictEnd);
            if (MEM_read32(match) == MEM_read32(ip))   /* assumption : matchIndex <= dictLimit-4 (by table construction) */
                currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;
        }

        /* save best solution */
        if (currentMl > ml) {
            ml = currentMl;
            *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
            if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
        }

        if (matchIndex <= minChain) break;
        matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask);
    }

    assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
    if (dictMode == ZSTD_dedicatedDictSearch) {
        ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts, dms,
                                                  ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
    } else if (dictMode == ZSTD_dictMatchState) {
        const U32* const dmsChainTable = dms->chainTable;
        const U32 dmsChainSize         = (1 << dms->cParams.chainLog);
        const U32 dmsChainMask         = dmsChainSize - 1;
        const U32 dmsLowestIndex       = dms->window.dictLimit;
        const BYTE* const dmsBase      = dms->window.base;
        const BYTE* const dmsEnd       = dms->window.nextSrc;
        const U32 dmsSize              = (U32)(dmsEnd - dmsBase);
        const U32 dmsIndexDelta        = dictLimit - dmsSize;
        const U32 dmsMinChain = dmsSize > dmsChainSize ? dmsSize - dmsChainSize : 0;

        matchIndex = dms->hashTable[ZSTD_hashPtr(ip, dms->cParams.hashLog, mls)];

        for ( ; (matchIndex>=dmsLowestIndex) & (nbAttempts>0) ; nbAttempts--) {
            size_t currentMl=0;
            const BYTE* const match = dmsBase + matchIndex;
            assert(match+4 <= dmsEnd);
            if (MEM_read32(match) == MEM_read32(ip))   /* assumption : matchIndex <= dictLimit-4 (by table construction) */
                currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4;

            /* save best solution */
            if (currentMl > ml) {
                ml = currentMl;
                assert(curr > matchIndex + dmsIndexDelta);
                *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
                if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
            }

            if (matchIndex <= dmsMinChain) break;

            matchIndex = dmsChainTable[matchIndex & dmsChainMask];
        }
    }

    return ml;
}

/* *********************************
* (SIMD) Row-based matchfinder
***********************************/
/* Constants for row-based hash */
#define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)
#define ZSTD_ROW_HASH_MAX_ENTRIES 64    /* absolute maximum number of entries per row, for all configurations */

#define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1)

typedef U64 ZSTD_VecMask;   /* Clarifies when we are interacting with a U64 representing a mask of matches */

/* ZSTD_VecMask_next():
 * Starting from the LSB, returns the idx of the next non-zero bit.
 * Basically counting the nb of trailing zeroes.
 */
MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
    return ZSTD_countTrailingZeros64(val);
}

/* ZSTD_row_nextIndex():
 * Returns the next index to insert at within a tagTable row, and updates the "head"
 * value to reflect the update. Essentially cycles backwards from [1, {entries per row})
 */
FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) {
    U32 next = (*tagRow-1) & rowMask;
    next += (next == 0) ? rowMask : 0; /* skip first position */
    *tagRow = (BYTE)next;
    return next;
}

/* ZSTD_isAligned():
 * Checks that a pointer is aligned to "align" bytes which must be a power of 2.
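 * e.g. ZSTD_isAligned(hashTable + relRow, 64) is asserted in ZSTD_row_prefetch()
 * below to confirm each prefetched hash row starts on a 64-byte (cache-line) boundary.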
 */
MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) {
    assert((align & (align - 1)) == 0);
    return (((size_t)ptr) & (align - 1)) == 0;
}

/* ZSTD_row_prefetch():
 * Performs prefetching for the hashTable and tagTable at a given row.
 */
FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, BYTE const* tagTable, U32 const relRow, U32 const rowLog) {
    PREFETCH_L1(hashTable + relRow);
    if (rowLog >= 5) {
        PREFETCH_L1(hashTable + relRow + 16);
        /* Note: prefetching more of the hash table does not appear to be beneficial for 128-entry rows */
    }
    PREFETCH_L1(tagTable + relRow);
    if (rowLog == 6) {
        PREFETCH_L1(tagTable + relRow + 32);
    }
    assert(rowLog == 4 || rowLog == 5 || rowLog == 6);
    assert(ZSTD_isAligned(hashTable + relRow, 64));                 /* prefetched hash row always 64-byte aligned */
    assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on correct multiple of bytes (32,64,128) */
}

/* ZSTD_row_fillHashCache():
 * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries,
 * but not beyond iLimit.
 */
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base,
                            U32 const rowLog, U32 const mls,
                            U32 idx, const BYTE* const iLimit)
{
    U32 const* const hashTable = ms->hashTable;
    BYTE const* const tagTable = ms->tagTable;
    U32 const hashLog = ms->rowHashLog;
    U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1);
    U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch);

    for (; idx < lim; ++idx) {
        U32 const hash = (U32)ZSTD_hashPtrSalted(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt);
        U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
        ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
        ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash;
    }

    DEBUGLOG(6, "ZSTD_row_fillHashCache(): [%u %u %u %u %u %u %u %u]", ms->hashCache[0], ms->hashCache[1],
                ms->hashCache[2], ms->hashCache[3], ms->hashCache[4],
                ms->hashCache[5], ms->hashCache[6], ms->hashCache[7]);
}

/* ZSTD_row_nextCachedHash():
 * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at
 * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable.
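 * In effect the cache is a small ring of ZSTD_ROW_HASH_CACHE_SIZE pre-computed hashes,
 * indexed by (idx & ZSTD_ROW_HASH_CACHE_MASK), so hashing and row prefetching run
 * ZSTD_ROW_HASH_CACHE_SIZE positions ahead of the position currently being searched.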
 */
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable,
                            BYTE const* tagTable, BYTE const* base,
                            U32 idx, U32 const hashLog,
                            U32 const rowLog, U32 const mls,
                            U64 const hashSalt)
{
    U32 const newHash = (U32)ZSTD_hashPtrSalted(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt);
    U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
    ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
    {   U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK];
        cache[idx & ZSTD_ROW_HASH_CACHE_MASK] = newHash;
        return hash;
    }
}

/* ZSTD_row_update_internalImpl():
 * Updates the hash table with positions starting from updateStartIdx until updateEndIdx.
 */
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms,
                                  U32 updateStartIdx, U32 const updateEndIdx,
                                  U32 const mls, U32 const rowLog,
                                  U32 const rowMask, U32 const useCache)
{
    U32* const hashTable = ms->hashTable;
    BYTE* const tagTable = ms->tagTable;
    U32 const hashLog = ms->rowHashLog;
    const BYTE* const base = ms->window.base;

    DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx);
    for (; updateStartIdx < updateEndIdx; ++updateStartIdx) {
        U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls, ms->hashSalt)
                                  : (U32)ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt);
        U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
        U32* const row = hashTable + relRow;
        BYTE* tagRow = tagTable + relRow;
        U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);

        assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt));
        tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK;
        row[pos] = updateStartIdx;
    }
}

/* ZSTD_row_update_internal():
 * Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate.
 * Skips sections of long matches as is necessary.
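 * Concretely: when the hash cache is in use and the gap between nextToUpdate and ip
 * exceeds kSkipThreshold, only the first kMaxMatchStartPositionsToUpdate positions and
 * the last kMaxMatchEndPositionsToUpdate positions of that gap are inserted; the middle
 * of a long match is left out of the table (see the useCache branch below).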
 */
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip,
                              U32 const mls, U32 const rowLog,
                              U32 const rowMask, U32 const useCache)
{
    U32 idx = ms->nextToUpdate;
    const BYTE* const base = ms->window.base;
    const U32 target = (U32)(ip - base);
    const U32 kSkipThreshold = 384;
    const U32 kMaxMatchStartPositionsToUpdate = 96;
    const U32 kMaxMatchEndPositionsToUpdate = 32;

    if (useCache) {
        /* Only skip positions when using hash cache, i.e.
         * if we are loading a dict, don't skip anything.
         * If we decide to skip, then we only update a set number
         * of positions at the beginning and end of the match.
         */
        if (UNLIKELY(target - idx > kSkipThreshold)) {
            U32 const bound = idx + kMaxMatchStartPositionsToUpdate;
            ZSTD_row_update_internalImpl(ms, idx, bound, mls, rowLog, rowMask, useCache);
            idx = target - kMaxMatchEndPositionsToUpdate;
            ZSTD_row_fillHashCache(ms, base, rowLog, mls, idx, ip+1);
        }
    }
    assert(target >= idx);
    ZSTD_row_update_internalImpl(ms, idx, target, mls, rowLog, rowMask, useCache);
    ms->nextToUpdate = target;
}

/* ZSTD_row_update():
 * External wrapper for ZSTD_row_update_internal(). Used for filling the hashtable during dictionary
 * processing.
 */
void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
    const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
    const U32 rowMask = (1u << rowLog) - 1;
    const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);

    DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);
    ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */);
}

/* Returns the width, in bits, of the bit groups that make up a match mask: each row entry
 * is represented by one group, and a matching entry sets a bit within its group. Since not
 * all architectures have an easy movemask instruction, working with whole bit groups makes
 * iterating over matches easier and faster.
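 * e.g. on little-endian NEON with 16 entries per row this function returns 4:
 * ZSTD_row_getNEONMask() then yields one 4-bit group per entry (note the 0x88...
 * mask it applies), and callers can recover an entry position from a bit position
 * found by ZSTD_VecMask_next() by, roughly, dividing by this group width.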
 */
FORCE_INLINE_TEMPLATE U32
ZSTD_row_matchMaskGroupWidth(const U32 rowEntries)
{
    assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
    assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
    (void)rowEntries;
#if defined(ZSTD_ARCH_ARM_NEON)
    /* NEON path only works for little endian */
    if (!MEM_isLittleEndian()) {
        return 1;
    }
    if (rowEntries == 16) {
        return 4;
    }
    if (rowEntries == 32) {
        return 2;
    }
    if (rowEntries == 64) {
        return 1;
    }
#endif
    return 1;
}

#if defined(ZSTD_ARCH_X86_SSE2)
FORCE_INLINE_TEMPLATE ZSTD_VecMask
ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U32 head)
{
    const __m128i comparisonMask = _mm_set1_epi8((char)tag);
    int matches[4] = {0};
    int i;
    assert(nbChunks == 1 || nbChunks == 2 || nbChunks == 4);
    for (i=0; i<nbChunks; i++) {
        const __m128i chunk = _mm_loadu_si128((const __m128i*)(const void*)(src + 16*i));
        const __m128i equalMask = _mm_cmpeq_epi8(chunk, comparisonMask);
        matches[i] = _mm_movemask_epi8(equalMask);
    }
    if (nbChunks == 1) return ZSTD_rotateRight_U16((U16)matches[0], head);
    if (nbChunks == 2) return ZSTD_rotateRight_U32((U32)matches[1] << 16 | (U32)matches[0], head);
    assert(nbChunks == 4);
    return ZSTD_rotateRight_U64((U64)matches[3] << 48 | (U64)matches[2] << 32 | (U64)matches[1] << 16 | (U64)matches[0], head);
}
#endif

#if defined(ZSTD_ARCH_ARM_NEON)
FORCE_INLINE_TEMPLATE ZSTD_VecMask
ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped)
{
    assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
    if (rowEntries == 16) {
        /* vshrn_n_u16 shifts by 4 every u16 and narrows to 8 lower bits.
         * After that groups of 4 bits represent the equalMask. We lower
         * all bits except the highest in these groups by doing AND with
         * 0x88 = 0b10001000.
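         * The surviving bits form one 4-bit group per row entry; the resulting
         * 64-bit value is then rotated by headGrouped so the groups line up with
         * the row's circular layout (see ZSTD_row_getMatchMask below).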
1016*3117ece4Schristos */ 1017*3117ece4Schristos const uint8x16_t chunk = vld1q_u8(src); 1018*3117ece4Schristos const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); 1019*3117ece4Schristos const uint8x8_t res = vshrn_n_u16(equalMask, 4); 1020*3117ece4Schristos const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0); 1021*3117ece4Schristos return ZSTD_rotateRight_U64(matches, headGrouped) & 0x8888888888888888ull; 1022*3117ece4Schristos } else if (rowEntries == 32) { 1023*3117ece4Schristos /* Same idea as with rowEntries == 16 but doing AND with 1024*3117ece4Schristos * 0x55 = 0b01010101. 1025*3117ece4Schristos */ 1026*3117ece4Schristos const uint16x8x2_t chunk = vld2q_u16((const uint16_t*)(const void*)src); 1027*3117ece4Schristos const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); 1028*3117ece4Schristos const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); 1029*3117ece4Schristos const uint8x16_t dup = vdupq_n_u8(tag); 1030*3117ece4Schristos const uint8x8_t t0 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk0, dup)), 6); 1031*3117ece4Schristos const uint8x8_t t1 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk1, dup)), 6); 1032*3117ece4Schristos const uint8x8_t res = vsli_n_u8(t0, t1, 4); 1033*3117ece4Schristos const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0) ; 1034*3117ece4Schristos return ZSTD_rotateRight_U64(matches, headGrouped) & 0x5555555555555555ull; 1035*3117ece4Schristos } else { /* rowEntries == 64 */ 1036*3117ece4Schristos const uint8x16x4_t chunk = vld4q_u8(src); 1037*3117ece4Schristos const uint8x16_t dup = vdupq_n_u8(tag); 1038*3117ece4Schristos const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); 1039*3117ece4Schristos const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); 1040*3117ece4Schristos const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); 1041*3117ece4Schristos const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); 1042*3117ece4Schristos 1043*3117ece4Schristos const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); 1044*3117ece4Schristos const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); 1045*3117ece4Schristos const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); 1046*3117ece4Schristos const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); 1047*3117ece4Schristos const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); 1048*3117ece4Schristos const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); 1049*3117ece4Schristos return ZSTD_rotateRight_U64(matches, headGrouped); 1050*3117ece4Schristos } 1051*3117ece4Schristos } 1052*3117ece4Schristos #endif 1053*3117ece4Schristos 1054*3117ece4Schristos /* Returns a ZSTD_VecMask (U64) that has the nth group (determined by 1055*3117ece4Schristos * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag" 1056*3117ece4Schristos * matches the hash at the nth position in a row of the tagTable. 1057*3117ece4Schristos * Each row is a circular buffer beginning at the value of "headGrouped". 
So we 1058*3117ece4Schristos * must rotate the "matches" bitfield to match up with the actual layout of the 1059*3117ece4Schristos * entries within the hashTable */ 1060*3117ece4Schristos FORCE_INLINE_TEMPLATE ZSTD_VecMask 1061*3117ece4Schristos ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries) 1062*3117ece4Schristos { 1063*3117ece4Schristos const BYTE* const src = tagRow; 1064*3117ece4Schristos assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); 1065*3117ece4Schristos assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); 1066*3117ece4Schristos assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8); 1067*3117ece4Schristos 1068*3117ece4Schristos #if defined(ZSTD_ARCH_X86_SSE2) 1069*3117ece4Schristos 1070*3117ece4Schristos return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped); 1071*3117ece4Schristos 1072*3117ece4Schristos #else /* SW or NEON-LE */ 1073*3117ece4Schristos 1074*3117ece4Schristos # if defined(ZSTD_ARCH_ARM_NEON) 1075*3117ece4Schristos /* This NEON path only works for little endian - otherwise use SWAR below */ 1076*3117ece4Schristos if (MEM_isLittleEndian()) { 1077*3117ece4Schristos return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped); 1078*3117ece4Schristos } 1079*3117ece4Schristos # endif /* ZSTD_ARCH_ARM_NEON */ 1080*3117ece4Schristos /* SWAR */ 1081*3117ece4Schristos { const int chunkSize = sizeof(size_t); 1082*3117ece4Schristos const size_t shiftAmount = ((chunkSize * 8) - chunkSize); 1083*3117ece4Schristos const size_t xFF = ~((size_t)0); 1084*3117ece4Schristos const size_t x01 = xFF / 0xFF; 1085*3117ece4Schristos const size_t x80 = x01 << 7; 1086*3117ece4Schristos const size_t splatChar = tag * x01; 1087*3117ece4Schristos ZSTD_VecMask matches = 0; 1088*3117ece4Schristos int i = rowEntries - chunkSize; 1089*3117ece4Schristos assert((sizeof(size_t) == 4) || (sizeof(size_t) == 8)); 1090*3117ece4Schristos if (MEM_isLittleEndian()) { /* runtime check so have two loops */ 1091*3117ece4Schristos const size_t extractMagic = (xFF / 0x7F) >> chunkSize; 1092*3117ece4Schristos do { 1093*3117ece4Schristos size_t chunk = MEM_readST(&src[i]); 1094*3117ece4Schristos chunk ^= splatChar; 1095*3117ece4Schristos chunk = (((chunk | x80) - x01) | chunk) & x80; 1096*3117ece4Schristos matches <<= chunkSize; 1097*3117ece4Schristos matches |= (chunk * extractMagic) >> shiftAmount; 1098*3117ece4Schristos i -= chunkSize; 1099*3117ece4Schristos } while (i >= 0); 1100*3117ece4Schristos } else { /* big endian: reverse bits during extraction */ 1101*3117ece4Schristos const size_t msb = xFF ^ (xFF >> 1); 1102*3117ece4Schristos const size_t extractMagic = (msb / 0x1FF) | msb; 1103*3117ece4Schristos do { 1104*3117ece4Schristos size_t chunk = MEM_readST(&src[i]); 1105*3117ece4Schristos chunk ^= splatChar; 1106*3117ece4Schristos chunk = (((chunk | x80) - x01) | chunk) & x80; 1107*3117ece4Schristos matches <<= chunkSize; 1108*3117ece4Schristos matches |= ((chunk >> 7) * extractMagic) >> shiftAmount; 1109*3117ece4Schristos i -= chunkSize; 1110*3117ece4Schristos } while (i >= 0); 1111*3117ece4Schristos } 1112*3117ece4Schristos matches = ~matches; 1113*3117ece4Schristos if (rowEntries == 16) { 1114*3117ece4Schristos return ZSTD_rotateRight_U16((U16)matches, headGrouped); 1115*3117ece4Schristos } else if (rowEntries == 32) { 1116*3117ece4Schristos return ZSTD_rotateRight_U32((U32)matches, headGrouped); 1117*3117ece4Schristos } else { 1118*3117ece4Schristos return 
ZSTD_rotateRight_U64((U64)matches, headGrouped);
1119*3117ece4Schristos }
1120*3117ece4Schristos }
1121*3117ece4Schristos #endif
1122*3117ece4Schristos }
1123*3117ece4Schristos
1124*3117ece4Schristos /* The high-level approach of the SIMD row based match finder is as follows:
1125*3117ece4Schristos * - Figure out where to insert the new entry:
1126*3117ece4Schristos * - Generate a hash for current input position and split it into a one byte of tag and `rowHashLog` bits of index.
1127*3117ece4Schristos * - The hash is salted by a value that changes on every context reset, so when the same table is used
1128*3117ece4Schristos * we will avoid collisions that would otherwise slow us down by introducing phantom matches.
1129*3117ece4Schristos * - The hashTable is effectively split into groups or "rows" of 15 or 31 entries of U32, and the index determines
1130*3117ece4Schristos * which row to insert into.
1131*3117ece4Schristos * - Determine the correct position within the row to insert the entry into. Each row of 15 or 31 can
1132*3117ece4Schristos * be considered as a circular buffer with a "head" index that resides in the tagTable (overall 16 or 32 bytes
1133*3117ece4Schristos * per row).
1134*3117ece4Schristos * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte tag calculated for the position and
1135*3117ece4Schristos * generate a bitfield that we can cycle through to check the collisions in the hash table.
1136*3117ece4Schristos * - Pick the longest match.
1137*3117ece4Schristos * - Insert the tag into the equivalent row and position in the tagTable.
1138*3117ece4Schristos */
1139*3117ece4Schristos FORCE_INLINE_TEMPLATE
1140*3117ece4Schristos ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
1141*3117ece4Schristos size_t ZSTD_RowFindBestMatch(
1142*3117ece4Schristos ZSTD_matchState_t* ms,
1143*3117ece4Schristos const BYTE* const ip, const BYTE* const iLimit,
1144*3117ece4Schristos size_t* offsetPtr,
1145*3117ece4Schristos const U32 mls, const ZSTD_dictMode_e dictMode,
1146*3117ece4Schristos const U32 rowLog)
1147*3117ece4Schristos {
1148*3117ece4Schristos U32* const hashTable = ms->hashTable;
1149*3117ece4Schristos BYTE* const tagTable = ms->tagTable;
1150*3117ece4Schristos U32* const hashCache = ms->hashCache;
1151*3117ece4Schristos const U32 hashLog = ms->rowHashLog;
1152*3117ece4Schristos const ZSTD_compressionParameters* const cParams = &ms->cParams;
1153*3117ece4Schristos const BYTE* const base = ms->window.base;
1154*3117ece4Schristos const BYTE* const dictBase = ms->window.dictBase;
1155*3117ece4Schristos const U32 dictLimit = ms->window.dictLimit;
1156*3117ece4Schristos const BYTE* const prefixStart = base + dictLimit;
1157*3117ece4Schristos const BYTE* const dictEnd = dictBase + dictLimit;
1158*3117ece4Schristos const U32 curr = (U32)(ip-base);
1159*3117ece4Schristos const U32 maxDistance = 1U << cParams->windowLog;
1160*3117ece4Schristos const U32 lowestValid = ms->window.lowLimit;
1161*3117ece4Schristos const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
1162*3117ece4Schristos const U32 isDictionary = (ms->loadedDictEnd != 0);
1163*3117ece4Schristos const U32 lowLimit = isDictionary ?
lowestValid : withinMaxDistance; 1164*3117ece4Schristos const U32 rowEntries = (1U << rowLog); 1165*3117ece4Schristos const U32 rowMask = rowEntries - 1; 1166*3117ece4Schristos const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */ 1167*3117ece4Schristos const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries); 1168*3117ece4Schristos const U64 hashSalt = ms->hashSalt; 1169*3117ece4Schristos U32 nbAttempts = 1U << cappedSearchLog; 1170*3117ece4Schristos size_t ml=4-1; 1171*3117ece4Schristos U32 hash; 1172*3117ece4Schristos 1173*3117ece4Schristos /* DMS/DDS variables that may be referenced laster */ 1174*3117ece4Schristos const ZSTD_matchState_t* const dms = ms->dictMatchState; 1175*3117ece4Schristos 1176*3117ece4Schristos /* Initialize the following variables to satisfy static analyzer */ 1177*3117ece4Schristos size_t ddsIdx = 0; 1178*3117ece4Schristos U32 ddsExtraAttempts = 0; /* cctx hash tables are limited in searches, but allow extra searches into DDS */ 1179*3117ece4Schristos U32 dmsTag = 0; 1180*3117ece4Schristos U32* dmsRow = NULL; 1181*3117ece4Schristos BYTE* dmsTagRow = NULL; 1182*3117ece4Schristos 1183*3117ece4Schristos if (dictMode == ZSTD_dedicatedDictSearch) { 1184*3117ece4Schristos const U32 ddsHashLog = dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG; 1185*3117ece4Schristos { /* Prefetch DDS hashtable entry */ 1186*3117ece4Schristos ddsIdx = ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG; 1187*3117ece4Schristos PREFETCH_L1(&dms->hashTable[ddsIdx]); 1188*3117ece4Schristos } 1189*3117ece4Schristos ddsExtraAttempts = cParams->searchLog > rowLog ? 1U << (cParams->searchLog - rowLog) : 0; 1190*3117ece4Schristos } 1191*3117ece4Schristos 1192*3117ece4Schristos if (dictMode == ZSTD_dictMatchState) { 1193*3117ece4Schristos /* Prefetch DMS rows */ 1194*3117ece4Schristos U32* const dmsHashTable = dms->hashTable; 1195*3117ece4Schristos BYTE* const dmsTagTable = dms->tagTable; 1196*3117ece4Schristos U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls); 1197*3117ece4Schristos U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; 1198*3117ece4Schristos dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK; 1199*3117ece4Schristos dmsTagRow = (BYTE*)(dmsTagTable + dmsRelRow); 1200*3117ece4Schristos dmsRow = dmsHashTable + dmsRelRow; 1201*3117ece4Schristos ZSTD_row_prefetch(dmsHashTable, dmsTagTable, dmsRelRow, rowLog); 1202*3117ece4Schristos } 1203*3117ece4Schristos 1204*3117ece4Schristos /* Update the hashTable and tagTable up to (but not including) ip */ 1205*3117ece4Schristos if (!ms->lazySkipping) { 1206*3117ece4Schristos ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); 1207*3117ece4Schristos hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt); 1208*3117ece4Schristos } else { 1209*3117ece4Schristos /* Stop inserting every position when in the lazy skipping mode. 1210*3117ece4Schristos * The hash cache is also not kept up to date in this mode. 
1211*3117ece4Schristos */ 1212*3117ece4Schristos hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); 1213*3117ece4Schristos ms->nextToUpdate = curr; 1214*3117ece4Schristos } 1215*3117ece4Schristos ms->hashSaltEntropy += hash; /* collect salt entropy */ 1216*3117ece4Schristos 1217*3117ece4Schristos { /* Get the hash for ip, compute the appropriate row */ 1218*3117ece4Schristos U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; 1219*3117ece4Schristos U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK; 1220*3117ece4Schristos U32* const row = hashTable + relRow; 1221*3117ece4Schristos BYTE* tagRow = (BYTE*)(tagTable + relRow); 1222*3117ece4Schristos U32 const headGrouped = (*tagRow & rowMask) * groupWidth; 1223*3117ece4Schristos U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; 1224*3117ece4Schristos size_t numMatches = 0; 1225*3117ece4Schristos size_t currMatch = 0; 1226*3117ece4Schristos ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries); 1227*3117ece4Schristos 1228*3117ece4Schristos /* Cycle through the matches and prefetch */ 1229*3117ece4Schristos for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { 1230*3117ece4Schristos U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; 1231*3117ece4Schristos U32 const matchIndex = row[matchPos]; 1232*3117ece4Schristos if(matchPos == 0) continue; 1233*3117ece4Schristos assert(numMatches < rowEntries); 1234*3117ece4Schristos if (matchIndex < lowLimit) 1235*3117ece4Schristos break; 1236*3117ece4Schristos if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { 1237*3117ece4Schristos PREFETCH_L1(base + matchIndex); 1238*3117ece4Schristos } else { 1239*3117ece4Schristos PREFETCH_L1(dictBase + matchIndex); 1240*3117ece4Schristos } 1241*3117ece4Schristos matchBuffer[numMatches++] = matchIndex; 1242*3117ece4Schristos --nbAttempts; 1243*3117ece4Schristos } 1244*3117ece4Schristos 1245*3117ece4Schristos /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop 1246*3117ece4Schristos in ZSTD_row_update_internal() at the next search. 
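 * Concretely, the block below writes the current position into the row's next circular
 * slot and bumps ms->nextToUpdate past it, so the next update loop starts one position later.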
*/ 1247*3117ece4Schristos { 1248*3117ece4Schristos U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); 1249*3117ece4Schristos tagRow[pos] = (BYTE)tag; 1250*3117ece4Schristos row[pos] = ms->nextToUpdate++; 1251*3117ece4Schristos } 1252*3117ece4Schristos 1253*3117ece4Schristos /* Return the longest match */ 1254*3117ece4Schristos for (; currMatch < numMatches; ++currMatch) { 1255*3117ece4Schristos U32 const matchIndex = matchBuffer[currMatch]; 1256*3117ece4Schristos size_t currentMl=0; 1257*3117ece4Schristos assert(matchIndex < curr); 1258*3117ece4Schristos assert(matchIndex >= lowLimit); 1259*3117ece4Schristos 1260*3117ece4Schristos if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { 1261*3117ece4Schristos const BYTE* const match = base + matchIndex; 1262*3117ece4Schristos assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ 1263*3117ece4Schristos /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ 1264*3117ece4Schristos if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ 1265*3117ece4Schristos currentMl = ZSTD_count(ip, match, iLimit); 1266*3117ece4Schristos } else { 1267*3117ece4Schristos const BYTE* const match = dictBase + matchIndex; 1268*3117ece4Schristos assert(match+4 <= dictEnd); 1269*3117ece4Schristos if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */ 1270*3117ece4Schristos currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4; 1271*3117ece4Schristos } 1272*3117ece4Schristos 1273*3117ece4Schristos /* Save best solution */ 1274*3117ece4Schristos if (currentMl > ml) { 1275*3117ece4Schristos ml = currentMl; 1276*3117ece4Schristos *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); 1277*3117ece4Schristos if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ 1278*3117ece4Schristos } 1279*3117ece4Schristos } 1280*3117ece4Schristos } 1281*3117ece4Schristos 1282*3117ece4Schristos assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. 
*/ 1283*3117ece4Schristos if (dictMode == ZSTD_dedicatedDictSearch) { 1284*3117ece4Schristos ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts + ddsExtraAttempts, dms, 1285*3117ece4Schristos ip, iLimit, prefixStart, curr, dictLimit, ddsIdx); 1286*3117ece4Schristos } else if (dictMode == ZSTD_dictMatchState) { 1287*3117ece4Schristos /* TODO: Measure and potentially add prefetching to DMS */ 1288*3117ece4Schristos const U32 dmsLowestIndex = dms->window.dictLimit; 1289*3117ece4Schristos const BYTE* const dmsBase = dms->window.base; 1290*3117ece4Schristos const BYTE* const dmsEnd = dms->window.nextSrc; 1291*3117ece4Schristos const U32 dmsSize = (U32)(dmsEnd - dmsBase); 1292*3117ece4Schristos const U32 dmsIndexDelta = dictLimit - dmsSize; 1293*3117ece4Schristos 1294*3117ece4Schristos { U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth; 1295*3117ece4Schristos U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; 1296*3117ece4Schristos size_t numMatches = 0; 1297*3117ece4Schristos size_t currMatch = 0; 1298*3117ece4Schristos ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries); 1299*3117ece4Schristos 1300*3117ece4Schristos for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { 1301*3117ece4Schristos U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; 1302*3117ece4Schristos U32 const matchIndex = dmsRow[matchPos]; 1303*3117ece4Schristos if(matchPos == 0) continue; 1304*3117ece4Schristos if (matchIndex < dmsLowestIndex) 1305*3117ece4Schristos break; 1306*3117ece4Schristos PREFETCH_L1(dmsBase + matchIndex); 1307*3117ece4Schristos matchBuffer[numMatches++] = matchIndex; 1308*3117ece4Schristos --nbAttempts; 1309*3117ece4Schristos } 1310*3117ece4Schristos 1311*3117ece4Schristos /* Return the longest match */ 1312*3117ece4Schristos for (; currMatch < numMatches; ++currMatch) { 1313*3117ece4Schristos U32 const matchIndex = matchBuffer[currMatch]; 1314*3117ece4Schristos size_t currentMl=0; 1315*3117ece4Schristos assert(matchIndex >= dmsLowestIndex); 1316*3117ece4Schristos assert(matchIndex < curr); 1317*3117ece4Schristos 1318*3117ece4Schristos { const BYTE* const match = dmsBase + matchIndex; 1319*3117ece4Schristos assert(match+4 <= dmsEnd); 1320*3117ece4Schristos if (MEM_read32(match) == MEM_read32(ip)) 1321*3117ece4Schristos currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4; 1322*3117ece4Schristos } 1323*3117ece4Schristos 1324*3117ece4Schristos if (currentMl > ml) { 1325*3117ece4Schristos ml = currentMl; 1326*3117ece4Schristos assert(curr > matchIndex + dmsIndexDelta); 1327*3117ece4Schristos *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); 1328*3117ece4Schristos if (ip+currentMl == iLimit) break; 1329*3117ece4Schristos } 1330*3117ece4Schristos } 1331*3117ece4Schristos } 1332*3117ece4Schristos } 1333*3117ece4Schristos return ml; 1334*3117ece4Schristos } 1335*3117ece4Schristos 1336*3117ece4Schristos 1337*3117ece4Schristos /** 1338*3117ece4Schristos * Generate search functions templated on (dictMode, mls, rowLog). 1339*3117ece4Schristos * These functions are outlined for code size & compilation time. 1340*3117ece4Schristos * ZSTD_searchMax() dispatches to the correct implementation function. 1341*3117ece4Schristos * 1342*3117ece4Schristos * TODO: The start of the search function involves loading and calculating a 1343*3117ece4Schristos * bunch of constants from the ZSTD_matchState_t. 
These computations could be 1344*3117ece4Schristos * done in an initialization function, and saved somewhere in the match state. 1345*3117ece4Schristos * Then we could pass a pointer to the saved state instead of the match state, 1346*3117ece4Schristos * and avoid duplicate computations. 1347*3117ece4Schristos * 1348*3117ece4Schristos * TODO: Move the match re-winding into searchMax. This improves compression 1349*3117ece4Schristos * ratio, and unlocks further simplifications with the next TODO. 1350*3117ece4Schristos * 1351*3117ece4Schristos * TODO: Try moving the repcode search into searchMax. After the re-winding 1352*3117ece4Schristos * and repcode search are in searchMax, there is no more logic in the match 1353*3117ece4Schristos * finder loop that requires knowledge about the dictMode. So we should be 1354*3117ece4Schristos * able to avoid force inlining it, and we can join the extDict loop with 1355*3117ece4Schristos * the single segment loop. It should go in searchMax instead of its own 1356*3117ece4Schristos * function to avoid having multiple virtual function calls per search. 1357*3117ece4Schristos */ 1358*3117ece4Schristos 1359*3117ece4Schristos #define ZSTD_BT_SEARCH_FN(dictMode, mls) ZSTD_BtFindBestMatch_##dictMode##_##mls 1360*3117ece4Schristos #define ZSTD_HC_SEARCH_FN(dictMode, mls) ZSTD_HcFindBestMatch_##dictMode##_##mls 1361*3117ece4Schristos #define ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog 1362*3117ece4Schristos 1363*3117ece4Schristos #define ZSTD_SEARCH_FN_ATTRS FORCE_NOINLINE 1364*3117ece4Schristos 1365*3117ece4Schristos #define GEN_ZSTD_BT_SEARCH_FN(dictMode, mls) \ 1366*3117ece4Schristos ZSTD_SEARCH_FN_ATTRS size_t ZSTD_BT_SEARCH_FN(dictMode, mls)( \ 1367*3117ece4Schristos ZSTD_matchState_t* ms, \ 1368*3117ece4Schristos const BYTE* ip, const BYTE* const iLimit, \ 1369*3117ece4Schristos size_t* offBasePtr) \ 1370*3117ece4Schristos { \ 1371*3117ece4Schristos assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \ 1372*3117ece4Schristos return ZSTD_BtFindBestMatch(ms, ip, iLimit, offBasePtr, mls, ZSTD_##dictMode); \ 1373*3117ece4Schristos } \ 1374*3117ece4Schristos 1375*3117ece4Schristos #define GEN_ZSTD_HC_SEARCH_FN(dictMode, mls) \ 1376*3117ece4Schristos ZSTD_SEARCH_FN_ATTRS size_t ZSTD_HC_SEARCH_FN(dictMode, mls)( \ 1377*3117ece4Schristos ZSTD_matchState_t* ms, \ 1378*3117ece4Schristos const BYTE* ip, const BYTE* const iLimit, \ 1379*3117ece4Schristos size_t* offsetPtr) \ 1380*3117ece4Schristos { \ 1381*3117ece4Schristos assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \ 1382*3117ece4Schristos return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \ 1383*3117ece4Schristos } \ 1384*3117ece4Schristos 1385*3117ece4Schristos #define GEN_ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) \ 1386*3117ece4Schristos ZSTD_SEARCH_FN_ATTRS size_t ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)( \ 1387*3117ece4Schristos ZSTD_matchState_t* ms, \ 1388*3117ece4Schristos const BYTE* ip, const BYTE* const iLimit, \ 1389*3117ece4Schristos size_t* offsetPtr) \ 1390*3117ece4Schristos { \ 1391*3117ece4Schristos assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \ 1392*3117ece4Schristos assert(MAX(4, MIN(6, ms->cParams.searchLog)) == rowLog); \ 1393*3117ece4Schristos return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode, rowLog); \ 1394*3117ece4Schristos } \ 1395*3117ece4Schristos 1396*3117ece4Schristos #define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \ 1397*3117ece4Schristos X(dictMode, mls, 4) \ 
1398*3117ece4Schristos X(dictMode, mls, 5) \ 1399*3117ece4Schristos X(dictMode, mls, 6) 1400*3117ece4Schristos 1401*3117ece4Schristos #define ZSTD_FOR_EACH_MLS_ROWLOG(X, dictMode) \ 1402*3117ece4Schristos ZSTD_FOR_EACH_ROWLOG(X, dictMode, 4) \ 1403*3117ece4Schristos ZSTD_FOR_EACH_ROWLOG(X, dictMode, 5) \ 1404*3117ece4Schristos ZSTD_FOR_EACH_ROWLOG(X, dictMode, 6) 1405*3117ece4Schristos 1406*3117ece4Schristos #define ZSTD_FOR_EACH_MLS(X, dictMode) \ 1407*3117ece4Schristos X(dictMode, 4) \ 1408*3117ece4Schristos X(dictMode, 5) \ 1409*3117ece4Schristos X(dictMode, 6) 1410*3117ece4Schristos 1411*3117ece4Schristos #define ZSTD_FOR_EACH_DICT_MODE(X, ...) \ 1412*3117ece4Schristos X(__VA_ARGS__, noDict) \ 1413*3117ece4Schristos X(__VA_ARGS__, extDict) \ 1414*3117ece4Schristos X(__VA_ARGS__, dictMatchState) \ 1415*3117ece4Schristos X(__VA_ARGS__, dedicatedDictSearch) 1416*3117ece4Schristos 1417*3117ece4Schristos /* Generate row search fns for each combination of (dictMode, mls, rowLog) */ 1418*3117ece4Schristos ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_SEARCH_FN) 1419*3117ece4Schristos /* Generate binary Tree search fns for each combination of (dictMode, mls) */ 1420*3117ece4Schristos ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_SEARCH_FN) 1421*3117ece4Schristos /* Generate hash chain search fns for each combination of (dictMode, mls) */ 1422*3117ece4Schristos ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_SEARCH_FN) 1423*3117ece4Schristos 1424*3117ece4Schristos typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e; 1425*3117ece4Schristos 1426*3117ece4Schristos #define GEN_ZSTD_CALL_BT_SEARCH_FN(dictMode, mls) \ 1427*3117ece4Schristos case mls: \ 1428*3117ece4Schristos return ZSTD_BT_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr); 1429*3117ece4Schristos #define GEN_ZSTD_CALL_HC_SEARCH_FN(dictMode, mls) \ 1430*3117ece4Schristos case mls: \ 1431*3117ece4Schristos return ZSTD_HC_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr); 1432*3117ece4Schristos #define GEN_ZSTD_CALL_ROW_SEARCH_FN(dictMode, mls, rowLog) \ 1433*3117ece4Schristos case rowLog: \ 1434*3117ece4Schristos return ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)(ms, ip, iend, offsetPtr); 1435*3117ece4Schristos 1436*3117ece4Schristos #define ZSTD_SWITCH_MLS(X, dictMode) \ 1437*3117ece4Schristos switch (mls) { \ 1438*3117ece4Schristos ZSTD_FOR_EACH_MLS(X, dictMode) \ 1439*3117ece4Schristos } 1440*3117ece4Schristos 1441*3117ece4Schristos #define ZSTD_SWITCH_ROWLOG(dictMode, mls) \ 1442*3117ece4Schristos case mls: \ 1443*3117ece4Schristos switch (rowLog) { \ 1444*3117ece4Schristos ZSTD_FOR_EACH_ROWLOG(GEN_ZSTD_CALL_ROW_SEARCH_FN, dictMode, mls) \ 1445*3117ece4Schristos } \ 1446*3117ece4Schristos ZSTD_UNREACHABLE; \ 1447*3117ece4Schristos break; 1448*3117ece4Schristos 1449*3117ece4Schristos #define ZSTD_SWITCH_SEARCH_METHOD(dictMode) \ 1450*3117ece4Schristos switch (searchMethod) { \ 1451*3117ece4Schristos case search_hashChain: \ 1452*3117ece4Schristos ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_HC_SEARCH_FN, dictMode) \ 1453*3117ece4Schristos break; \ 1454*3117ece4Schristos case search_binaryTree: \ 1455*3117ece4Schristos ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_BT_SEARCH_FN, dictMode) \ 1456*3117ece4Schristos break; \ 1457*3117ece4Schristos case search_rowHash: \ 1458*3117ece4Schristos ZSTD_SWITCH_MLS(ZSTD_SWITCH_ROWLOG, dictMode) \ 1459*3117ece4Schristos break; \ 1460*3117ece4Schristos } \ 1461*3117ece4Schristos ZSTD_UNREACHABLE; 1462*3117ece4Schristos 1463*3117ece4Schristos /** 
1464*3117ece4Schristos * Searches for the longest match at @p ip. 1465*3117ece4Schristos * Dispatches to the correct implementation function based on the 1466*3117ece4Schristos * (searchMethod, dictMode, mls, rowLog). We use switch statements 1467*3117ece4Schristos * here instead of using an indirect function call through a function 1468*3117ece4Schristos * pointer because after Spectre and Meltdown mitigations, indirect 1469*3117ece4Schristos * function calls can be very costly, especially in the kernel. 1470*3117ece4Schristos * 1471*3117ece4Schristos * NOTE: dictMode and searchMethod should be templated, so those switch 1472*3117ece4Schristos * statements should be optimized out. Only the mls & rowLog switches 1473*3117ece4Schristos * should be left. 1474*3117ece4Schristos * 1475*3117ece4Schristos * @param ms The match state. 1476*3117ece4Schristos * @param ip The position to search at. 1477*3117ece4Schristos * @param iend The end of the input data. 1478*3117ece4Schristos * @param[out] offsetPtr Stores the match offset into this pointer. 1479*3117ece4Schristos * @param mls The minimum search length, in the range [4, 6]. 1480*3117ece4Schristos * @param rowLog The row log (if applicable), in the range [4, 6]. 1481*3117ece4Schristos * @param searchMethod The search method to use (templated). 1482*3117ece4Schristos * @param dictMode The dictMode (templated). 1483*3117ece4Schristos * 1484*3117ece4Schristos * @returns The length of the longest match found, or < mls if no match is found. 1485*3117ece4Schristos * If a match is found its offset is stored in @p offsetPtr. 1486*3117ece4Schristos */ 1487*3117ece4Schristos FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax( 1488*3117ece4Schristos ZSTD_matchState_t* ms, 1489*3117ece4Schristos const BYTE* ip, 1490*3117ece4Schristos const BYTE* iend, 1491*3117ece4Schristos size_t* offsetPtr, 1492*3117ece4Schristos U32 const mls, 1493*3117ece4Schristos U32 const rowLog, 1494*3117ece4Schristos searchMethod_e const searchMethod, 1495*3117ece4Schristos ZSTD_dictMode_e const dictMode) 1496*3117ece4Schristos { 1497*3117ece4Schristos if (dictMode == ZSTD_noDict) { 1498*3117ece4Schristos ZSTD_SWITCH_SEARCH_METHOD(noDict) 1499*3117ece4Schristos } else if (dictMode == ZSTD_extDict) { 1500*3117ece4Schristos ZSTD_SWITCH_SEARCH_METHOD(extDict) 1501*3117ece4Schristos } else if (dictMode == ZSTD_dictMatchState) { 1502*3117ece4Schristos ZSTD_SWITCH_SEARCH_METHOD(dictMatchState) 1503*3117ece4Schristos } else if (dictMode == ZSTD_dedicatedDictSearch) { 1504*3117ece4Schristos ZSTD_SWITCH_SEARCH_METHOD(dedicatedDictSearch) 1505*3117ece4Schristos } 1506*3117ece4Schristos ZSTD_UNREACHABLE; 1507*3117ece4Schristos return 0; 1508*3117ece4Schristos } 1509*3117ece4Schristos 1510*3117ece4Schristos /* ******************************* 1511*3117ece4Schristos * Common parser - lazy strategy 1512*3117ece4Schristos *********************************/ 1513*3117ece4Schristos 1514*3117ece4Schristos FORCE_INLINE_TEMPLATE 1515*3117ece4Schristos ZSTD_ALLOW_POINTER_OVERFLOW_ATTR 1516*3117ece4Schristos size_t ZSTD_compressBlock_lazy_generic( 1517*3117ece4Schristos ZSTD_matchState_t* ms, seqStore_t* seqStore, 1518*3117ece4Schristos U32 rep[ZSTD_REP_NUM], 1519*3117ece4Schristos const void* src, size_t srcSize, 1520*3117ece4Schristos const searchMethod_e searchMethod, const U32 depth, 1521*3117ece4Schristos ZSTD_dictMode_e const dictMode) 1522*3117ece4Schristos { 1523*3117ece4Schristos const BYTE* const istart = (const BYTE*)src; 1524*3117ece4Schristos const BYTE* ip = istart; 1525*3117ece4Schristos const 
BYTE* anchor = istart; 1526*3117ece4Schristos const BYTE* const iend = istart + srcSize; 1527*3117ece4Schristos const BYTE* const ilimit = (searchMethod == search_rowHash) ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8; 1528*3117ece4Schristos const BYTE* const base = ms->window.base; 1529*3117ece4Schristos const U32 prefixLowestIndex = ms->window.dictLimit; 1530*3117ece4Schristos const BYTE* const prefixLowest = base + prefixLowestIndex; 1531*3117ece4Schristos const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); 1532*3117ece4Schristos const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); 1533*3117ece4Schristos 1534*3117ece4Schristos U32 offset_1 = rep[0], offset_2 = rep[1]; 1535*3117ece4Schristos U32 offsetSaved1 = 0, offsetSaved2 = 0; 1536*3117ece4Schristos 1537*3117ece4Schristos const int isDMS = dictMode == ZSTD_dictMatchState; 1538*3117ece4Schristos const int isDDS = dictMode == ZSTD_dedicatedDictSearch; 1539*3117ece4Schristos const int isDxS = isDMS || isDDS; 1540*3117ece4Schristos const ZSTD_matchState_t* const dms = ms->dictMatchState; 1541*3117ece4Schristos const U32 dictLowestIndex = isDxS ? dms->window.dictLimit : 0; 1542*3117ece4Schristos const BYTE* const dictBase = isDxS ? dms->window.base : NULL; 1543*3117ece4Schristos const BYTE* const dictLowest = isDxS ? dictBase + dictLowestIndex : NULL; 1544*3117ece4Schristos const BYTE* const dictEnd = isDxS ? dms->window.nextSrc : NULL; 1545*3117ece4Schristos const U32 dictIndexDelta = isDxS ? 1546*3117ece4Schristos prefixLowestIndex - (U32)(dictEnd - dictBase) : 1547*3117ece4Schristos 0; 1548*3117ece4Schristos const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest)); 1549*3117ece4Schristos 1550*3117ece4Schristos DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod); 1551*3117ece4Schristos ip += (dictAndPrefixLength == 0); 1552*3117ece4Schristos if (dictMode == ZSTD_noDict) { 1553*3117ece4Schristos U32 const curr = (U32)(ip - base); 1554*3117ece4Schristos U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog); 1555*3117ece4Schristos U32 const maxRep = curr - windowLow; 1556*3117ece4Schristos if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; 1557*3117ece4Schristos if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; 1558*3117ece4Schristos } 1559*3117ece4Schristos if (isDxS) { 1560*3117ece4Schristos /* dictMatchState repCode checks don't currently handle repCode == 0 1561*3117ece4Schristos * disabling. */ 1562*3117ece4Schristos assert(offset_1 <= dictAndPrefixLength); 1563*3117ece4Schristos assert(offset_2 <= dictAndPrefixLength); 1564*3117ece4Schristos } 1565*3117ece4Schristos 1566*3117ece4Schristos /* Reset the lazy skipping state */ 1567*3117ece4Schristos ms->lazySkipping = 0; 1568*3117ece4Schristos 1569*3117ece4Schristos if (searchMethod == search_rowHash) { 1570*3117ece4Schristos ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); 1571*3117ece4Schristos } 1572*3117ece4Schristos 1573*3117ece4Schristos /* Match Loop */ 1574*3117ece4Schristos #if defined(__GNUC__) && defined(__x86_64__) 1575*3117ece4Schristos /* I've measured random a 5% speed loss on levels 5 & 6 (greedy) when the 1576*3117ece4Schristos * code alignment is perturbed. To fix the instability align the loop on 32-bytes. 
1577*3117ece4Schristos */ 1578*3117ece4Schristos __asm__(".p2align 5"); 1579*3117ece4Schristos #endif 1580*3117ece4Schristos while (ip < ilimit) { 1581*3117ece4Schristos size_t matchLength=0; 1582*3117ece4Schristos size_t offBase = REPCODE1_TO_OFFBASE; 1583*3117ece4Schristos const BYTE* start=ip+1; 1584*3117ece4Schristos DEBUGLOG(7, "search baseline (depth 0)"); 1585*3117ece4Schristos 1586*3117ece4Schristos /* check repCode */ 1587*3117ece4Schristos if (isDxS) { 1588*3117ece4Schristos const U32 repIndex = (U32)(ip - base) + 1 - offset_1; 1589*3117ece4Schristos const BYTE* repMatch = ((dictMode == ZSTD_dictMatchState || dictMode == ZSTD_dedicatedDictSearch) 1590*3117ece4Schristos && repIndex < prefixLowestIndex) ? 1591*3117ece4Schristos dictBase + (repIndex - dictIndexDelta) : 1592*3117ece4Schristos base + repIndex; 1593*3117ece4Schristos if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) 1594*3117ece4Schristos && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { 1595*3117ece4Schristos const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; 1596*3117ece4Schristos matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; 1597*3117ece4Schristos if (depth==0) goto _storeSequence; 1598*3117ece4Schristos } 1599*3117ece4Schristos } 1600*3117ece4Schristos if ( dictMode == ZSTD_noDict 1601*3117ece4Schristos && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)))) { 1602*3117ece4Schristos matchLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; 1603*3117ece4Schristos if (depth==0) goto _storeSequence; 1604*3117ece4Schristos } 1605*3117ece4Schristos 1606*3117ece4Schristos /* first search (depth 0) */ 1607*3117ece4Schristos { size_t offbaseFound = 999999999; 1608*3117ece4Schristos size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode); 1609*3117ece4Schristos if (ml2 > matchLength) 1610*3117ece4Schristos matchLength = ml2, start = ip, offBase = offbaseFound; 1611*3117ece4Schristos } 1612*3117ece4Schristos 1613*3117ece4Schristos if (matchLength < 4) { 1614*3117ece4Schristos size_t const step = ((size_t)(ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */; 1615*3117ece4Schristos ip += step; 1616*3117ece4Schristos /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. 1617*3117ece4Schristos * In this mode we stop inserting every position into our tables, and only insert 1618*3117ece4Schristos * positions that we search, which is one in step positions. 1619*3117ece4Schristos * The exact cutoff is flexible, I've just chosen a number that is reasonably high, 1620*3117ece4Schristos * so we minimize the compression ratio loss in "normal" scenarios. This mode gets 1621*3117ece4Schristos * triggered once we've gone 2KB without finding any matches. 
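 * (Assuming kSearchStrength == 8, step = ((ip - anchor) >> 8) + 1 first exceeds
 * kLazySkippingStep == 8 once ip - anchor reaches 2048 bytes, which is where the 2KB
 * figure comes from.)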
1622*3117ece4Schristos */ 1623*3117ece4Schristos ms->lazySkipping = step > kLazySkippingStep; 1624*3117ece4Schristos continue; 1625*3117ece4Schristos } 1626*3117ece4Schristos 1627*3117ece4Schristos /* let's try to find a better solution */ 1628*3117ece4Schristos if (depth>=1) 1629*3117ece4Schristos while (ip<ilimit) { 1630*3117ece4Schristos DEBUGLOG(7, "search depth 1"); 1631*3117ece4Schristos ip ++; 1632*3117ece4Schristos if ( (dictMode == ZSTD_noDict) 1633*3117ece4Schristos && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { 1634*3117ece4Schristos size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; 1635*3117ece4Schristos int const gain2 = (int)(mlRep * 3); 1636*3117ece4Schristos int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); 1637*3117ece4Schristos if ((mlRep >= 4) && (gain2 > gain1)) 1638*3117ece4Schristos matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; 1639*3117ece4Schristos } 1640*3117ece4Schristos if (isDxS) { 1641*3117ece4Schristos const U32 repIndex = (U32)(ip - base) - offset_1; 1642*3117ece4Schristos const BYTE* repMatch = repIndex < prefixLowestIndex ? 1643*3117ece4Schristos dictBase + (repIndex - dictIndexDelta) : 1644*3117ece4Schristos base + repIndex; 1645*3117ece4Schristos if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) 1646*3117ece4Schristos && (MEM_read32(repMatch) == MEM_read32(ip)) ) { 1647*3117ece4Schristos const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; 1648*3117ece4Schristos size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; 1649*3117ece4Schristos int const gain2 = (int)(mlRep * 3); 1650*3117ece4Schristos int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); 1651*3117ece4Schristos if ((mlRep >= 4) && (gain2 > gain1)) 1652*3117ece4Schristos matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; 1653*3117ece4Schristos } 1654*3117ece4Schristos } 1655*3117ece4Schristos { size_t ofbCandidate=999999999; 1656*3117ece4Schristos size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); 1657*3117ece4Schristos int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ 1658*3117ece4Schristos int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); 1659*3117ece4Schristos if ((ml2 >= 4) && (gain2 > gain1)) { 1660*3117ece4Schristos matchLength = ml2, offBase = ofbCandidate, start = ip; 1661*3117ece4Schristos continue; /* search a better one */ 1662*3117ece4Schristos } } 1663*3117ece4Schristos 1664*3117ece4Schristos /* let's find an even better one */ 1665*3117ece4Schristos if ((depth==2) && (ip<ilimit)) { 1666*3117ece4Schristos DEBUGLOG(7, "search depth 2"); 1667*3117ece4Schristos ip ++; 1668*3117ece4Schristos if ( (dictMode == ZSTD_noDict) 1669*3117ece4Schristos && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { 1670*3117ece4Schristos size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; 1671*3117ece4Schristos int const gain2 = (int)(mlRep * 4); 1672*3117ece4Schristos int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); 1673*3117ece4Schristos if ((mlRep >= 4) && (gain2 > gain1)) 1674*3117ece4Schristos matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; 1675*3117ece4Schristos } 1676*3117ece4Schristos if (isDxS) { 1677*3117ece4Schristos const U32 repIndex = (U32)(ip - base) - offset_1; 1678*3117ece4Schristos 
const BYTE* repMatch = repIndex < prefixLowestIndex ? 1679*3117ece4Schristos dictBase + (repIndex - dictIndexDelta) : 1680*3117ece4Schristos base + repIndex; 1681*3117ece4Schristos if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) 1682*3117ece4Schristos && (MEM_read32(repMatch) == MEM_read32(ip)) ) { 1683*3117ece4Schristos const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; 1684*3117ece4Schristos size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; 1685*3117ece4Schristos int const gain2 = (int)(mlRep * 4); 1686*3117ece4Schristos int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); 1687*3117ece4Schristos if ((mlRep >= 4) && (gain2 > gain1)) 1688*3117ece4Schristos matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; 1689*3117ece4Schristos } 1690*3117ece4Schristos } 1691*3117ece4Schristos { size_t ofbCandidate=999999999; 1692*3117ece4Schristos size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); 1693*3117ece4Schristos int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ 1694*3117ece4Schristos int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); 1695*3117ece4Schristos if ((ml2 >= 4) && (gain2 > gain1)) { 1696*3117ece4Schristos matchLength = ml2, offBase = ofbCandidate, start = ip; 1697*3117ece4Schristos continue; 1698*3117ece4Schristos } } } 1699*3117ece4Schristos break; /* nothing found : store previous solution */ 1700*3117ece4Schristos } 1701*3117ece4Schristos 1702*3117ece4Schristos /* NOTE: 1703*3117ece4Schristos * Pay attention that `start[-value]` can lead to strange undefined behavior 1704*3117ece4Schristos * notably if `value` is unsigned, resulting in a large positive `-value`. 1705*3117ece4Schristos */ 1706*3117ece4Schristos /* catch up */ 1707*3117ece4Schristos if (OFFBASE_IS_OFFSET(offBase)) { 1708*3117ece4Schristos if (dictMode == ZSTD_noDict) { 1709*3117ece4Schristos while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest)) 1710*3117ece4Schristos && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) ) /* only search for offset within prefix */ 1711*3117ece4Schristos { start--; matchLength++; } 1712*3117ece4Schristos } 1713*3117ece4Schristos if (isDxS) { 1714*3117ece4Schristos U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); 1715*3117ece4Schristos const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex; 1716*3117ece4Schristos const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest; 1717*3117ece4Schristos while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ 1718*3117ece4Schristos } 1719*3117ece4Schristos offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); 1720*3117ece4Schristos } 1721*3117ece4Schristos /* store sequence */ 1722*3117ece4Schristos _storeSequence: 1723*3117ece4Schristos { size_t const litLength = (size_t)(start - anchor); 1724*3117ece4Schristos ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); 1725*3117ece4Schristos anchor = ip = start + matchLength; 1726*3117ece4Schristos } 1727*3117ece4Schristos if (ms->lazySkipping) { 1728*3117ece4Schristos /* We've found a match, disable lazy skipping mode, and refill the hash cache. 
*/ 1729*3117ece4Schristos if (searchMethod == search_rowHash) { 1730*3117ece4Schristos ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); 1731*3117ece4Schristos } 1732*3117ece4Schristos ms->lazySkipping = 0; 1733*3117ece4Schristos } 1734*3117ece4Schristos 1735*3117ece4Schristos /* check immediate repcode */ 1736*3117ece4Schristos if (isDxS) { 1737*3117ece4Schristos while (ip <= ilimit) { 1738*3117ece4Schristos U32 const current2 = (U32)(ip-base); 1739*3117ece4Schristos U32 const repIndex = current2 - offset_2; 1740*3117ece4Schristos const BYTE* repMatch = repIndex < prefixLowestIndex ? 1741*3117ece4Schristos dictBase - dictIndexDelta + repIndex : 1742*3117ece4Schristos base + repIndex; 1743*3117ece4Schristos if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex) >= 3 /* intentional overflow */) 1744*3117ece4Schristos && (MEM_read32(repMatch) == MEM_read32(ip)) ) { 1745*3117ece4Schristos const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend; 1746*3117ece4Schristos matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4; 1747*3117ece4Schristos offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset_2 <=> offset_1 */ 1748*3117ece4Schristos ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); 1749*3117ece4Schristos ip += matchLength; 1750*3117ece4Schristos anchor = ip; 1751*3117ece4Schristos continue; 1752*3117ece4Schristos } 1753*3117ece4Schristos break; 1754*3117ece4Schristos } 1755*3117ece4Schristos } 1756*3117ece4Schristos 1757*3117ece4Schristos if (dictMode == ZSTD_noDict) { 1758*3117ece4Schristos while ( ((ip <= ilimit) & (offset_2>0)) 1759*3117ece4Schristos && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) { 1760*3117ece4Schristos /* store sequence */ 1761*3117ece4Schristos matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; 1762*3117ece4Schristos offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */ 1763*3117ece4Schristos ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); 1764*3117ece4Schristos ip += matchLength; 1765*3117ece4Schristos anchor = ip; 1766*3117ece4Schristos continue; /* faster when present ... (?) */ 1767*3117ece4Schristos } } } 1768*3117ece4Schristos 1769*3117ece4Schristos /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), 1770*3117ece4Schristos * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ 1771*3117ece4Schristos offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; 1772*3117ece4Schristos 1773*3117ece4Schristos /* save reps for next block */ 1774*3117ece4Schristos rep[0] = offset_1 ? offset_1 : offsetSaved1; 1775*3117ece4Schristos rep[1] = offset_2 ? 
offset_2 : offsetSaved2; 1776*3117ece4Schristos 1777*3117ece4Schristos /* Return the last literals size */ 1778*3117ece4Schristos return (size_t)(iend - anchor); 1779*3117ece4Schristos } 1780*3117ece4Schristos #endif /* build exclusions */ 1781*3117ece4Schristos 1782*3117ece4Schristos 1783*3117ece4Schristos #ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR 1784*3117ece4Schristos size_t ZSTD_compressBlock_greedy( 1785*3117ece4Schristos ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1786*3117ece4Schristos void const* src, size_t srcSize) 1787*3117ece4Schristos { 1788*3117ece4Schristos return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict); 1789*3117ece4Schristos } 1790*3117ece4Schristos 1791*3117ece4Schristos size_t ZSTD_compressBlock_greedy_dictMatchState( 1792*3117ece4Schristos ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1793*3117ece4Schristos void const* src, size_t srcSize) 1794*3117ece4Schristos { 1795*3117ece4Schristos return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState); 1796*3117ece4Schristos } 1797*3117ece4Schristos 1798*3117ece4Schristos size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( 1799*3117ece4Schristos ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1800*3117ece4Schristos void const* src, size_t srcSize) 1801*3117ece4Schristos { 1802*3117ece4Schristos return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch); 1803*3117ece4Schristos } 1804*3117ece4Schristos 1805*3117ece4Schristos size_t ZSTD_compressBlock_greedy_row( 1806*3117ece4Schristos ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1807*3117ece4Schristos void const* src, size_t srcSize) 1808*3117ece4Schristos { 1809*3117ece4Schristos return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict); 1810*3117ece4Schristos } 1811*3117ece4Schristos 1812*3117ece4Schristos size_t ZSTD_compressBlock_greedy_dictMatchState_row( 1813*3117ece4Schristos ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1814*3117ece4Schristos void const* src, size_t srcSize) 1815*3117ece4Schristos { 1816*3117ece4Schristos return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState); 1817*3117ece4Schristos } 1818*3117ece4Schristos 1819*3117ece4Schristos size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( 1820*3117ece4Schristos ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1821*3117ece4Schristos void const* src, size_t srcSize) 1822*3117ece4Schristos { 1823*3117ece4Schristos return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch); 1824*3117ece4Schristos } 1825*3117ece4Schristos #endif 1826*3117ece4Schristos 1827*3117ece4Schristos #ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR 1828*3117ece4Schristos size_t ZSTD_compressBlock_lazy( 1829*3117ece4Schristos ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1830*3117ece4Schristos void const* src, size_t srcSize) 1831*3117ece4Schristos { 1832*3117ece4Schristos return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict); 1833*3117ece4Schristos } 1834*3117ece4Schristos 1835*3117ece4Schristos size_t ZSTD_compressBlock_lazy_dictMatchState( 1836*3117ece4Schristos ZSTD_matchState_t* ms, seqStore_t* seqStore, 
U32 rep[ZSTD_REP_NUM], 1837*3117ece4Schristos void const* src, size_t srcSize) 1838*3117ece4Schristos { 1839*3117ece4Schristos return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState); 1840*3117ece4Schristos } 1841*3117ece4Schristos 1842*3117ece4Schristos size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( 1843*3117ece4Schristos ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1844*3117ece4Schristos void const* src, size_t srcSize) 1845*3117ece4Schristos { 1846*3117ece4Schristos return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch); 1847*3117ece4Schristos } 1848*3117ece4Schristos 1849*3117ece4Schristos size_t ZSTD_compressBlock_lazy_row( 1850*3117ece4Schristos ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1851*3117ece4Schristos void const* src, size_t srcSize) 1852*3117ece4Schristos { 1853*3117ece4Schristos return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict); 1854*3117ece4Schristos } 1855*3117ece4Schristos 1856*3117ece4Schristos size_t ZSTD_compressBlock_lazy_dictMatchState_row( 1857*3117ece4Schristos ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1858*3117ece4Schristos void const* src, size_t srcSize) 1859*3117ece4Schristos { 1860*3117ece4Schristos return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState); 1861*3117ece4Schristos } 1862*3117ece4Schristos 1863*3117ece4Schristos size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( 1864*3117ece4Schristos ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1865*3117ece4Schristos void const* src, size_t srcSize) 1866*3117ece4Schristos { 1867*3117ece4Schristos return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch); 1868*3117ece4Schristos } 1869*3117ece4Schristos #endif 1870*3117ece4Schristos 1871*3117ece4Schristos #ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR 1872*3117ece4Schristos size_t ZSTD_compressBlock_lazy2( 1873*3117ece4Schristos ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1874*3117ece4Schristos void const* src, size_t srcSize) 1875*3117ece4Schristos { 1876*3117ece4Schristos return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict); 1877*3117ece4Schristos } 1878*3117ece4Schristos 1879*3117ece4Schristos size_t ZSTD_compressBlock_lazy2_dictMatchState( 1880*3117ece4Schristos ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1881*3117ece4Schristos void const* src, size_t srcSize) 1882*3117ece4Schristos { 1883*3117ece4Schristos return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState); 1884*3117ece4Schristos } 1885*3117ece4Schristos 1886*3117ece4Schristos size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( 1887*3117ece4Schristos ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1888*3117ece4Schristos void const* src, size_t srcSize) 1889*3117ece4Schristos { 1890*3117ece4Schristos return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch); 1891*3117ece4Schristos } 1892*3117ece4Schristos 1893*3117ece4Schristos size_t ZSTD_compressBlock_lazy2_row( 1894*3117ece4Schristos ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1895*3117ece4Schristos void 
const* src, size_t srcSize) 1896*3117ece4Schristos { 1897*3117ece4Schristos return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict); 1898*3117ece4Schristos } 1899*3117ece4Schristos 1900*3117ece4Schristos size_t ZSTD_compressBlock_lazy2_dictMatchState_row( 1901*3117ece4Schristos ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1902*3117ece4Schristos void const* src, size_t srcSize) 1903*3117ece4Schristos { 1904*3117ece4Schristos return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState); 1905*3117ece4Schristos } 1906*3117ece4Schristos 1907*3117ece4Schristos size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( 1908*3117ece4Schristos ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1909*3117ece4Schristos void const* src, size_t srcSize) 1910*3117ece4Schristos { 1911*3117ece4Schristos return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch); 1912*3117ece4Schristos } 1913*3117ece4Schristos #endif 1914*3117ece4Schristos 1915*3117ece4Schristos #ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR 1916*3117ece4Schristos size_t ZSTD_compressBlock_btlazy2( 1917*3117ece4Schristos ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1918*3117ece4Schristos void const* src, size_t srcSize) 1919*3117ece4Schristos { 1920*3117ece4Schristos return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict); 1921*3117ece4Schristos } 1922*3117ece4Schristos 1923*3117ece4Schristos size_t ZSTD_compressBlock_btlazy2_dictMatchState( 1924*3117ece4Schristos ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1925*3117ece4Schristos void const* src, size_t srcSize) 1926*3117ece4Schristos { 1927*3117ece4Schristos return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState); 1928*3117ece4Schristos } 1929*3117ece4Schristos #endif 1930*3117ece4Schristos 1931*3117ece4Schristos #if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ 1932*3117ece4Schristos || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ 1933*3117ece4Schristos || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ 1934*3117ece4Schristos || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) 1935*3117ece4Schristos FORCE_INLINE_TEMPLATE 1936*3117ece4Schristos ZSTD_ALLOW_POINTER_OVERFLOW_ATTR 1937*3117ece4Schristos size_t ZSTD_compressBlock_lazy_extDict_generic( 1938*3117ece4Schristos ZSTD_matchState_t* ms, seqStore_t* seqStore, 1939*3117ece4Schristos U32 rep[ZSTD_REP_NUM], 1940*3117ece4Schristos const void* src, size_t srcSize, 1941*3117ece4Schristos const searchMethod_e searchMethod, const U32 depth) 1942*3117ece4Schristos { 1943*3117ece4Schristos const BYTE* const istart = (const BYTE*)src; 1944*3117ece4Schristos const BYTE* ip = istart; 1945*3117ece4Schristos const BYTE* anchor = istart; 1946*3117ece4Schristos const BYTE* const iend = istart + srcSize; 1947*3117ece4Schristos const BYTE* const ilimit = searchMethod == search_rowHash ? 
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_compressBlock_lazy_extDict_generic(
                        ZSTD_matchState_t* ms, seqStore_t* seqStore,
                        U32 rep[ZSTD_REP_NUM],
                        const void* src, size_t srcSize,
                        const searchMethod_e searchMethod, const U32 depth)
{
    const BYTE* const istart = (const BYTE*)src;
    const BYTE* ip = istart;
    const BYTE* anchor = istart;
    const BYTE* const iend = istart + srcSize;
    const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
    const BYTE* const base = ms->window.base;
    const U32 dictLimit = ms->window.dictLimit;
    const BYTE* const prefixStart = base + dictLimit;
    const BYTE* const dictBase = ms->window.dictBase;
    const BYTE* const dictEnd = dictBase + dictLimit;
    const BYTE* const dictStart = dictBase + ms->window.lowLimit;
    const U32 windowLog = ms->cParams.windowLog;
    const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
    const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);

    U32 offset_1 = rep[0], offset_2 = rep[1];

    DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);

    /* Reset the lazy skipping state */
    ms->lazySkipping = 0;

    /* init */
    ip += (ip == prefixStart);
    if (searchMethod == search_rowHash) {
        ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
    }

    /* Match Loop */
#if defined(__GNUC__) && defined(__x86_64__)
    /* I've measured a random 5% speed loss on levels 5 & 6 (greedy) when the
     * code alignment is perturbed. To fix the instability, align the loop on 32 bytes.
     */
    __asm__(".p2align 5");
#endif
    while (ip < ilimit) {
        size_t matchLength=0;
        size_t offBase = REPCODE1_TO_OFFBASE;
        const BYTE* start=ip+1;
        U32 curr = (U32)(ip-base);

        /* check repCode */
        {   const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr+1, windowLog);
            const U32 repIndex = (U32)(curr+1 - offset_1);
            const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
            const BYTE* const repMatch = repBase + repIndex;
            if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow */
               & (offset_1 <= curr+1 - windowLow) ) /* note: we are searching at curr+1 */
            if (MEM_read32(ip+1) == MEM_read32(repMatch)) {
                /* repcode detected: we should take it */
                const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repEnd, prefixStart) + 4;
                if (depth==0) goto _storeSequence;
        }   }
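
        /* Editor's note on the `(U32)((dictLimit-1) - repIndex) >= 3` test above
         * (and its repetitions below), as I read it: it rejects repIndex values in
         * [dictLimit-3, dictLimit-1], i.e. positions where the 4-byte read at
         * repMatch would straddle the extDict/prefix boundary. Worked example,
         * assuming dictLimit == 1000:
         *   repIndex ==  998 -> (999 -  998) ==  1            -> rejected (straddles boundary)
         *   repIndex ==  900 -> (999 -  900) == 99             -> accepted (fully in extDict)
         *   repIndex == 1500 -> (999 - 1500) wraps to a huge U32 -> accepted (fully in prefix)
         */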
        /* first search (depth 0) */
        {   size_t ofbCandidate = 999999999;
            size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
            if (ml2 > matchLength)
                matchLength = ml2, start = ip, offBase = ofbCandidate;
        }

        if (matchLength < 4) {
            size_t const step = ((size_t)(ip-anchor) >> kSearchStrength);
            ip += step + 1;   /* jump faster over incompressible sections */
            /* Enter lazy skipping mode once we are skipping more than 8 bytes at a time.
             * In this mode we stop inserting every position into our tables, and only insert
             * positions that we search, which is one in `step` positions.
             * The exact cutoff is flexible; it is simply a number chosen to be reasonably high,
             * so we minimize the compression ratio loss in "normal" scenarios. This mode gets
             * triggered once we've gone 2KB without finding any matches.
             */
            ms->lazySkipping = step > kLazySkippingStep;
            continue;
        }

        /* let's try to find a better solution */
        if (depth>=1)
        while (ip<ilimit) {
            ip ++;
            curr++;
            /* check repCode */
            if (offBase) {
                const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
                const U32 repIndex = (U32)(curr - offset_1);
                const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
                const BYTE* const repMatch = repBase + repIndex;
                if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
                   & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
                if (MEM_read32(ip) == MEM_read32(repMatch)) {
                    /* repcode detected */
                    const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                    size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
                    int const gain2 = (int)(repLength * 3);
                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
                    if ((repLength >= 4) && (gain2 > gain1))
                        matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
            }   }
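
            /* Editor's note on the gain heuristic above: the repcode is adopted when
             *     repLength*3 > matchLength*3 - ZSTD_highbit32(offBase) + 1,
             * i.e. a repcode (cheap to encode) may be somewhat shorter than the current
             * candidate and still win. Worked example: with matchLength == 8 and an
             * offBase around 2^16, gain1 ~= 24 - 16 + 1 = 9, so a 4-byte repcode
             * (gain2 == 12) replaces the 8-byte far match. */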
            /* search match, depth 1 */
            {   size_t ofbCandidate = 999999999;
                size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate));   /* raw approx */
                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
                if ((ml2 >= 4) && (gain2 > gain1)) {
                    matchLength = ml2, offBase = ofbCandidate, start = ip;
                    continue;   /* search a better one */
            }   }

            /* let's find an even better one */
            if ((depth==2) && (ip<ilimit)) {
                ip ++;
                curr++;
                /* check repCode */
                if (offBase) {
                    const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
                    const U32 repIndex = (U32)(curr - offset_1);
                    const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
                    const BYTE* const repMatch = repBase + repIndex;
                    if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
                       & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
                    if (MEM_read32(ip) == MEM_read32(repMatch)) {
                        /* repcode detected */
                        const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                        size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
                        int const gain2 = (int)(repLength * 4);
                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
                        if ((repLength >= 4) && (gain2 > gain1))
                            matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
                }   }
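
                /* Editor's note: as the parser looks further ahead, the match already in
                 * hand gets a growing handicap: the depth-1 search above required the new
                 * match to beat the current one by more than 4 "gain" points, and the
                 * depth-2 search below requires more than 7, presumably because each extra
                 * step turns one more byte into a literal before the match. */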
                /* search match, depth 2 */
                {   size_t ofbCandidate = 999999999;
                    size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate));   /* raw approx */
                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
                    if ((ml2 >= 4) && (gain2 > gain1)) {
                        matchLength = ml2, offBase = ofbCandidate, start = ip;
                        continue;
            }   }   }
            break;   /* nothing found : store previous solution */
        }

        /* catch up */
        if (OFFBASE_IS_OFFSET(offBase)) {
            U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase));
            const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
            const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
            while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; }   /* catch up */
            offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase);
        }

        /* store sequence */
_storeSequence:
        {   size_t const litLength = (size_t)(start - anchor);
            ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
            anchor = ip = start + matchLength;
        }
        if (ms->lazySkipping) {
            /* We've found a match; disable lazy skipping mode and refill the hash cache. */
            if (searchMethod == search_rowHash) {
                ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
            }
            ms->lazySkipping = 0;
        }
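
        /* Editor's note: after a sequence is emitted, the loop below greedily emits any
         * immediately-following matches at the previous offset (offset_2) as zero-literal
         * repcode sequences. Each hit swaps the offset history (offset_1 <-> offset_2),
         * which mirrors the repcode update rule on the decoding side. */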
        /* check immediate repcode */
        while (ip <= ilimit) {
            const U32 repCurrent = (U32)(ip-base);
            const U32 windowLow = ZSTD_getLowestMatchIndex(ms, repCurrent, windowLog);
            const U32 repIndex = repCurrent - offset_2;
            const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
            const BYTE* const repMatch = repBase + repIndex;
            if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
               & (offset_2 <= repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
            if (MEM_read32(ip) == MEM_read32(repMatch)) {
                /* repcode detected: we should take it */
                const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
                offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase;   /* swap offset history */
                ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
                ip += matchLength;
                anchor = ip;
                continue;   /* faster when present ... (?) */
            }
            break;
    }   }

    /* Save reps for next block */
    rep[0] = offset_1;
    rep[1] = offset_2;

    /* Return the last literals size */
    return (size_t)(iend - anchor);
}
#endif /* build exclusions */

#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR
size_t ZSTD_compressBlock_greedy_extDict(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0);
}

size_t ZSTD_compressBlock_greedy_extDict_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0);
}
#endif

#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR
size_t ZSTD_compressBlock_lazy_extDict(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1);
}

size_t ZSTD_compressBlock_lazy_extDict_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1);
}
#endif

#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR
size_t ZSTD_compressBlock_lazy2_extDict(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2);
}

size_t ZSTD_compressBlock_lazy2_extDict_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
}
#endif

#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR
size_t ZSTD_compressBlock_btlazy2_extDict(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2);
}
#endif