/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */

#include "zstd_compress_internal.h"  /* ZSTD_hashPtr, ZSTD_count, ZSTD_storeSeq */
#include "zstd_fast.h"

static
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
void ZSTD_fillHashTableForCDict(ZSTD_matchState_t* ms,
                        const void* const end,
                        ZSTD_dictTableLoadMethod_e dtlm)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32* const hashTable = ms->hashTable;
    U32  const hBits = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
    U32  const mls = cParams->minMatch;
    const BYTE* const base = ms->window.base;
    const BYTE* ip = base + ms->nextToUpdate;
    const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
    const U32 fastHashFillStep = 3;

    /* Currently, we always use ZSTD_dtlm_full for filling CDict tables.
     * Feel free to remove this assert if there's a good reason! */
    assert(dtlm == ZSTD_dtlm_full);
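    /* CDict tables store "tagged" entries: the upper bits of each slot hold the
     * match index, while the low ZSTD_SHORT_CACHE_TAG_BITS bits hold a small tag
     * derived from the hash. ZSTD_writeTaggedIndex packs both into one U32, which
     * lets the dictMatchState search (via ZSTD_comparePackedTags) reject most
     * non-matching candidates without reading the dictionary content. */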
    /* Always insert every fastHashFillStep position into the hash table.
     * Insert the other positions if their hash entry is empty.
     */
    for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) {
        U32 const curr = (U32)(ip - base);
        {   size_t const hashAndTag = ZSTD_hashPtr(ip, hBits, mls);
            ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr);   }

        if (dtlm == ZSTD_dtlm_fast) continue;
        /* Only load extra positions for ZSTD_dtlm_full */
        {   U32 p;
            for (p = 1; p < fastHashFillStep; ++p) {
                size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls);
                if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) {  /* not yet filled */
                    ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p);
    }   }   }   }
}

static
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
void ZSTD_fillHashTableForCCtx(ZSTD_matchState_t* ms,
                        const void* const end,
                        ZSTD_dictTableLoadMethod_e dtlm)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32* const hashTable = ms->hashTable;
    U32  const hBits = cParams->hashLog;
    U32  const mls = cParams->minMatch;
    const BYTE* const base = ms->window.base;
    const BYTE* ip = base + ms->nextToUpdate;
    const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
    const U32 fastHashFillStep = 3;

    /* Currently, we always use ZSTD_dtlm_fast for filling CCtx tables.
     * Feel free to remove this assert if there's a good reason! */
    assert(dtlm == ZSTD_dtlm_fast);
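    /* Unlike the CDict variant above, CCtx tables hold plain position indices
     * (no tag bits), so entries are written directly instead of going through
     * ZSTD_writeTaggedIndex. */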
    /* Always insert every fastHashFillStep position into the hash table.
     * Insert the other positions if their hash entry is empty.
     */
    for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) {
        U32 const curr = (U32)(ip - base);
        size_t const hash0 = ZSTD_hashPtr(ip, hBits, mls);
        hashTable[hash0] = curr;
        if (dtlm == ZSTD_dtlm_fast) continue;
        /* Only load extra positions for ZSTD_dtlm_full */
        {   U32 p;
            for (p = 1; p < fastHashFillStep; ++p) {
                size_t const hash = ZSTD_hashPtr(ip + p, hBits, mls);
                if (hashTable[hash] == 0) {  /* not yet filled */
                    hashTable[hash] = curr + p;
    }   }   }   }
}

void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
                        const void* const end,
                        ZSTD_dictTableLoadMethod_e dtlm,
                        ZSTD_tableFillPurpose_e tfp)
{
    if (tfp == ZSTD_tfp_forCDict) {
        ZSTD_fillHashTableForCDict(ms, end, dtlm);
    } else {
        ZSTD_fillHashTableForCCtx(ms, end, dtlm);
    }
}


/**
 * If you squint hard enough (and ignore repcodes), the search operation at any
 * given position is broken into 4 stages:
 *
 * 1. Hash   (map position to hash value via input read)
 * 2. Lookup (map hash val to index via hashtable read)
 * 3. Load   (map index to value at that position via input read)
 * 4. Compare
 *
 * Each of these steps involves a memory read at an address which is computed
 * from the previous step. This means these steps must be sequenced and their
 * latencies are cumulative.
 *
 * Rather than do 1->2->3->4 sequentially for a single position before moving
 * onto the next, this implementation interleaves these operations across the
 * next few positions:
 *
 * R = Repcode Read & Compare
 * H = Hash
 * T = Table Lookup
 * M = Match Read & Compare
 *
 * Pos | Time -->
 * ----+-------------------
 * N   | ... M
 * N+1 | ...   TM
 * N+2 | R H   T M
 * N+3 |   H     TM
 * N+4 |     R H   T M
 * N+5 |         H   ...
 * N+6 |           R ...
 *
 * This is very much analogous to the pipelining of execution in a CPU. And just
 * like a CPU, we have to dump the pipeline when we find a match (i.e., take a
 * branch).
 *
 * When this happens, we throw away our current state, and do the following prep
 * to re-enter the loop:
 *
 * Pos | Time -->
 * ----+-------------------
 * N   | H T
 * N+1 |   H
 *
 * This is also the work we do at the beginning to enter the loop initially.
 */
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_compressBlock_fast_noDict_generic(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize,
        U32 const mls, U32 const hasStep)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32* const hashTable = ms->hashTable;
    U32 const hlog = cParams->hashLog;
    /* support stepSize of 0 */
    size_t const stepSize = hasStep ? (cParams->targetLength + !(cParams->targetLength) + 1) : 2;
    const BYTE* const base = ms->window.base;
    const BYTE* const istart = (const BYTE*)src;
    const U32   endIndex = (U32)((size_t)(istart - base) + srcSize);
    const U32   prefixStartIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog);
    const BYTE* const prefixStart = base + prefixStartIndex;
    const BYTE* const iend = istart + srcSize;
    const BYTE* const ilimit = iend - HASH_READ_SIZE;

    const BYTE* anchor = istart;
    const BYTE* ip0 = istart;
    const BYTE* ip1;
    const BYTE* ip2;
    const BYTE* ip3;
    U32 current0;

    U32 rep_offset1 = rep[0];
    U32 rep_offset2 = rep[1];
    U32 offsetSaved1 = 0, offsetSaved2 = 0;

    size_t hash0; /* hash for ip0 */
    size_t hash1; /* hash for ip1 */
    U32 idx; /* match idx for ip0 */
    U32 mval; /* src value at match idx */

    U32 offcode;
    const BYTE* match0;
    size_t mLength;

    /* ip0 and ip1 are always adjacent. The targetLength skipping and
     * uncompressibility acceleration is applied to every other position,
     * matching the behavior of #1562. step therefore represents the gap
     * between pairs of positions, from ip0 to ip2 or ip1 to ip3. */
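    /* step starts at stepSize and is incremented by one every kStepIncr bytes
     * traversed without finding a match (see "calculate step" in the loop below),
     * so scanning accelerates through incompressible regions. It resets to
     * stepSize each time the pipeline is re-entered at _start. */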
    size_t step;
    const BYTE* nextStep;
    const size_t kStepIncr = (1 << (kSearchStrength - 1));

    DEBUGLOG(5, "ZSTD_compressBlock_fast_generic");
    ip0 += (ip0 == prefixStart);
    {   U32 const curr = (U32)(ip0 - base);
        U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog);
        U32 const maxRep = curr - windowLow;
        if (rep_offset2 > maxRep) offsetSaved2 = rep_offset2, rep_offset2 = 0;
        if (rep_offset1 > maxRep) offsetSaved1 = rep_offset1, rep_offset1 = 0;
    }

    /* start each op */
_start: /* Requires: ip0 */

    step = stepSize;
    nextStep = ip0 + kStepIncr;

    /* calculate positions, ip0 - anchor == 0, so we skip step calc */
    ip1 = ip0 + 1;
    ip2 = ip0 + step;
    ip3 = ip2 + 1;

    if (ip3 >= ilimit) {
        goto _cleanup;
    }

    hash0 = ZSTD_hashPtr(ip0, hlog, mls);
    hash1 = ZSTD_hashPtr(ip1, hlog, mls);

    idx = hashTable[hash0];

    do {
        /* load repcode match for ip[2] */
        const U32 rval = MEM_read32(ip2 - rep_offset1);

        /* write back hash table entry */
        current0 = (U32)(ip0 - base);
        hashTable[hash0] = current0;

        /* check repcode at ip[2] */
        if ((MEM_read32(ip2) == rval) & (rep_offset1 > 0)) {
            ip0 = ip2;
            match0 = ip0 - rep_offset1;
            mLength = ip0[-1] == match0[-1];
            ip0 -= mLength;
            match0 -= mLength;
            offcode = REPCODE1_TO_OFFBASE;
            mLength += 4;

            /* First write next hash table entry; we've already calculated it.
             * This write is known to be safe because the ip1 is before the
             * repcode (ip2). */
            hashTable[hash1] = (U32)(ip1 - base);

            goto _match;
        }
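        /* When idx falls below the prefix window, there is no valid match to load.
         * Reading ip0's own bytes and flipping one bit yields a value that cannot
         * equal MEM_read32(ip0), so the comparison below fails without requiring
         * a separate branch. */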
        /* load match for ip[0] */
        if (idx >= prefixStartIndex) {
            mval = MEM_read32(base + idx);
        } else {
            mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */
        }

        /* check match at ip[0] */
        if (MEM_read32(ip0) == mval) {
            /* found a match! */

            /* First write next hash table entry; we've already calculated it.
             * This write is known to be safe because the ip1 == ip0 + 1, so
             * we know we will resume searching after ip1 */
            hashTable[hash1] = (U32)(ip1 - base);

            goto _offset;
        }

        /* lookup ip[1] */
        idx = hashTable[hash1];

        /* hash ip[2] */
        hash0 = hash1;
        hash1 = ZSTD_hashPtr(ip2, hlog, mls);

        /* advance to next positions */
        ip0 = ip1;
        ip1 = ip2;
        ip2 = ip3;

        /* write back hash table entry */
        current0 = (U32)(ip0 - base);
        hashTable[hash0] = current0;

        /* load match for ip[0] */
        if (idx >= prefixStartIndex) {
            mval = MEM_read32(base + idx);
        } else {
            mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */
        }

        /* check match at ip[0] */
        if (MEM_read32(ip0) == mval) {
            /* found a match! */

            /* first write next hash table entry; we've already calculated it */
            if (step <= 4) {
                /* We need to avoid writing an index into the hash table >= the
                 * position at which we will pick up our searching after we've
                 * taken this match.
                 *
                 * The minimum possible match has length 4, so the earliest ip0
                 * can be after we take this match will be the current ip0 + 4.
                 * ip1 is ip0 + step - 1. If ip1 is >= ip0 + 4, we can't safely
                 * write this position.
                 */
                hashTable[hash1] = (U32)(ip1 - base);
            }

            goto _offset;
        }

        /* lookup ip[1] */
        idx = hashTable[hash1];

        /* hash ip[2] */
        hash0 = hash1;
        hash1 = ZSTD_hashPtr(ip2, hlog, mls);

        /* advance to next positions */
        ip0 = ip1;
        ip1 = ip2;
        ip2 = ip0 + step;
        ip3 = ip1 + step;

        /* calculate step */
        if (ip2 >= nextStep) {
            step++;
            PREFETCH_L1(ip1 + 64);
            PREFETCH_L1(ip1 + 128);
            nextStep += kStepIncr;
        }
    } while (ip3 < ilimit);

_cleanup:
    /* Note that there are probably still a couple positions we could search.
     * However, it seems to be a meaningful performance hit to try to search
     * them. So let's not. */

    /* When the repcodes are outside of the prefix, we set them to zero before the loop.
     * When the offsets are still zero, we need to restore them after the block to have a correct
     * repcode history. If only one offset was invalid, it is easy. The tricky case is when both
     * offsets were invalid. We need to figure out which offset to refill with.
     *    - If both offsets are zero they are in the same order.
     *    - If both offsets are non-zero, we won't restore the offsets from `offsetSaved[12]`.
     *    - If only one is zero, we need to decide which offset to restore.
     *        - If rep_offset1 is non-zero, then rep_offset2 must be offsetSaved1.
     *        - It is impossible for rep_offset2 to be non-zero.
     *
     * So if rep_offset1 started invalid (offsetSaved1 != 0) and became valid (rep_offset1 != 0), then
     * set rep[0] = rep_offset1 and rep[1] = offsetSaved1.
     */
    offsetSaved2 = ((offsetSaved1 != 0) && (rep_offset1 != 0)) ? offsetSaved1 : offsetSaved2;

    /* save reps for next block */
    rep[0] = rep_offset1 ? rep_offset1 : offsetSaved1;
    rep[1] = rep_offset2 ? rep_offset2 : offsetSaved2;

    /* Return the last literals size */
    return (size_t)(iend - anchor);

_offset: /* Requires: ip0, idx */

    /* Compute the offset code. */
    match0 = base + idx;
    rep_offset2 = rep_offset1;
    rep_offset1 = (U32)(ip0-match0);
    offcode = OFFSET_TO_OFFBASE(rep_offset1);
    mLength = 4;

    /* Count the backwards match length. */
    while (((ip0>anchor) & (match0>prefixStart)) && (ip0[-1] == match0[-1])) {
        ip0--;
        match0--;
        mLength++;
    }

_match: /* Requires: ip0, match0, offcode */

    /* Count the forward length. */
    mLength += ZSTD_count(ip0 + mLength, match0 + mLength, iend);

    ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength);

    ip0 += mLength;
    anchor = ip0;
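    /* After a match, the table is only partially refilled below: positions
     * current0+2 and ip0-2 are recorded, rather than every position the match
     * covered. */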
    /* Fill table and check for immediate repcode. */
    if (ip0 <= ilimit) {
        /* Fill Table */
        assert(base+current0+2 > istart);  /* check base overflow */
        hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2;  /* here because current+2 could be > iend-8 */
        hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);

        if (rep_offset2 > 0) { /* rep_offset2==0 means rep_offset2 is invalidated */
            while ( (ip0 <= ilimit) && (MEM_read32(ip0) == MEM_read32(ip0 - rep_offset2)) ) {
                /* store sequence */
                size_t const rLength = ZSTD_count(ip0+4, ip0+4-rep_offset2, iend) + 4;
                { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */
                hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base);
                ip0 += rLength;
                ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, REPCODE1_TO_OFFBASE, rLength);
                anchor = ip0;
                continue;   /* faster when present (confirmed on gcc-8) ... (?) */
    }   }   }

    goto _start;
}

#define ZSTD_GEN_FAST_FN(dictMode, mls, step)                                                            \
    static size_t ZSTD_compressBlock_fast_##dictMode##_##mls##_##step(                                   \
            ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],                          \
            void const* src, size_t srcSize)                                                             \
    {                                                                                                    \
        return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls, step); \
    }

ZSTD_GEN_FAST_FN(noDict, 4, 1)
ZSTD_GEN_FAST_FN(noDict, 5, 1)
ZSTD_GEN_FAST_FN(noDict, 6, 1)
ZSTD_GEN_FAST_FN(noDict, 7, 1)

ZSTD_GEN_FAST_FN(noDict, 4, 0)
ZSTD_GEN_FAST_FN(noDict, 5, 0)
ZSTD_GEN_FAST_FN(noDict, 6, 0)
ZSTD_GEN_FAST_FN(noDict, 7, 0)

size_t ZSTD_compressBlock_fast(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    U32 const mls = ms->cParams.minMatch;
    assert(ms->dictMatchState == NULL);
    if (ms->cParams.targetLength > 1) {
        switch(mls)
        {
        default: /* includes case 3 */
        case 4 :
            return ZSTD_compressBlock_fast_noDict_4_1(ms, seqStore, rep, src, srcSize);
        case 5 :
            return ZSTD_compressBlock_fast_noDict_5_1(ms, seqStore, rep, src, srcSize);
        case 6 :
            return ZSTD_compressBlock_fast_noDict_6_1(ms, seqStore, rep, src, srcSize);
        case 7 :
            return ZSTD_compressBlock_fast_noDict_7_1(ms, seqStore, rep, src, srcSize);
        }
    } else {
        switch(mls)
        {
        default: /* includes case 3 */
        case 4 :
            return ZSTD_compressBlock_fast_noDict_4_0(ms, seqStore, rep, src, srcSize);
        case 5 :
            return ZSTD_compressBlock_fast_noDict_5_0(ms, seqStore, rep, src, srcSize);
        case 6 :
            return ZSTD_compressBlock_fast_noDict_6_0(ms, seqStore, rep, src, srcSize);
        case 7 :
            return ZSTD_compressBlock_fast_noDict_7_0(ms, seqStore, rep, src, srcSize);
        }

    }
}

FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_compressBlock_fast_dictMatchState_generic(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize, U32 const mls, U32 const hasStep)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32* const hashTable = ms->hashTable;
    U32 const hlog = cParams->hashLog;
    /* support stepSize of 0 */
    U32 const stepSize = cParams->targetLength + !(cParams->targetLength);
    const BYTE* const base = ms->window.base;
    const BYTE* const istart = (const BYTE*)src;
    const BYTE* ip0 = istart;
    const BYTE* ip1 = ip0 + stepSize; /* we assert below that stepSize >= 1 */
    const BYTE* anchor = istart;
    const U32   prefixStartIndex = ms->window.dictLimit;
    const BYTE* const prefixStart = base + prefixStartIndex;
    const BYTE* const iend = istart + srcSize;
    const BYTE* const ilimit = iend - HASH_READ_SIZE;
    U32 offset_1=rep[0], offset_2=rep[1];

    const ZSTD_matchState_t* const dms = ms->dictMatchState;
    const ZSTD_compressionParameters* const dictCParams = &dms->cParams ;
    const U32* const dictHashTable = dms->hashTable;
    const U32 dictStartIndex       = dms->window.dictLimit;
    const BYTE* const dictBase     = dms->window.base;
    const BYTE* const dictStart    = dictBase + dictStartIndex;
    const BYTE* const dictEnd      = dms->window.nextSrc;
    const U32 dictIndexDelta       = prefixStartIndex - (U32)(dictEnd - dictBase);
    const U32 dictAndPrefixLength  = (U32)(istart - prefixStart + dictEnd - dictStart);
    const U32 dictHBits            = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
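    /* dictIndexDelta translates between the two index spaces: a dictionary index d
     * corresponds to local index d + dictIndexDelta, and a local index below
     * prefixStartIndex maps back into the dictionary at
     * dictBase + (index - dictIndexDelta). */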
    /* if a dictionary is still attached, it necessarily means that
     * it is within window size. So we just check it. */
    const U32 maxDistance = 1U << cParams->windowLog;
    const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
    assert(endIndex - prefixStartIndex <= maxDistance);
    (void)maxDistance; (void)endIndex;   /* these variables are not used when assert() is disabled */

    (void)hasStep; /* not currently specialized on whether it's accelerated */

    /* ensure there will be no underflow
     * when translating a dict index into a local index */
    assert(prefixStartIndex >= (U32)(dictEnd - dictBase));

    if (ms->prefetchCDictTables) {
        size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32);
        PREFETCH_AREA(dictHashTable, hashTableBytes);
    }

    /* init */
    DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic");
    ip0 += (dictAndPrefixLength == 0);
    /* dictMatchState repCode checks don't currently handle repCode == 0
     * disabling. */
    assert(offset_1 <= dictAndPrefixLength);
    assert(offset_2 <= dictAndPrefixLength);

    /* Outer search loop */
    assert(stepSize >= 1);
    while (ip1 <= ilimit) {   /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */
        size_t mLength;
        size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls);
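        /* The CDict table was filled with tagged entries (see
         * ZSTD_fillHashTableForCDict), so ZSTD_comparePackedTags can cheaply
         * filter out candidates whose hash tag differs, avoiding a read into
         * the dictionary content for most positions. */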
        size_t const dictHashAndTag0 = ZSTD_hashPtr(ip0, dictHBits, mls);
        U32 dictMatchIndexAndTag = dictHashTable[dictHashAndTag0 >> ZSTD_SHORT_CACHE_TAG_BITS];
        int dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag0);

        U32 matchIndex = hashTable[hash0];
        U32 curr = (U32)(ip0 - base);
        size_t step = stepSize;
        const size_t kStepIncr = 1 << kSearchStrength;
        const BYTE* nextStep = ip0 + kStepIncr;

        /* Inner search loop */
        while (1) {
            const BYTE* match = base + matchIndex;
            const U32 repIndex = curr + 1 - offset_1;
            const BYTE* repMatch = (repIndex < prefixStartIndex) ?
                                   dictBase + (repIndex - dictIndexDelta) :
                                   base + repIndex;
            const size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls);
            size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls);
            hashTable[hash0] = curr;   /* update hash table */

            if (((U32) ((prefixStartIndex - 1) - repIndex) >=
                 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */
                && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) {
                const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
                mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4;
                ip0++;
                ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
                break;
            }

            if (dictTagsMatch) {
                /* Found a possible dict match */
                const U32 dictMatchIndex = dictMatchIndexAndTag >> ZSTD_SHORT_CACHE_TAG_BITS;
                const BYTE* dictMatch = dictBase + dictMatchIndex;
                if (dictMatchIndex > dictStartIndex &&
                    MEM_read32(dictMatch) == MEM_read32(ip0)) {
                    /* To replicate extDict parse behavior, we only use dict matches when the normal matchIndex is invalid */
                    if (matchIndex <= prefixStartIndex) {
                        U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta);
                        mLength = ZSTD_count_2segments(ip0 + 4, dictMatch + 4, iend, dictEnd, prefixStart) + 4;
                        while (((ip0 > anchor) & (dictMatch > dictStart))
                               && (ip0[-1] == dictMatch[-1])) {
                            ip0--;
                            dictMatch--;
                            mLength++;
                        } /* catch up */
                        offset_2 = offset_1;
                        offset_1 = offset;
                        ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
                        break;
                    }
                }
            }

            if (matchIndex > prefixStartIndex && MEM_read32(match) == MEM_read32(ip0)) {
                /* found a regular match */
                U32 const offset = (U32) (ip0 - match);
                mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4;
                while (((ip0 > anchor) & (match > prefixStart))
                       && (ip0[-1] == match[-1])) {
                    ip0--;
                    match--;
                    mLength++;
                } /* catch up */
                offset_2 = offset_1;
                offset_1 = offset;
                ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
                break;
            }

            /* Prepare for next iteration */
            dictMatchIndexAndTag = dictHashTable[dictHashAndTag1 >> ZSTD_SHORT_CACHE_TAG_BITS];
            dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag1);
            matchIndex = hashTable[hash1];

            if (ip1 >= nextStep) {
                step++;
                nextStep += kStepIncr;
            }
            ip0 = ip1;
            ip1 = ip1 + step;
            if (ip1 > ilimit) goto _cleanup;

            curr = (U32)(ip0 - base);
            hash0 = hash1;
        }   /* end inner search loop */

        /* match found */
        assert(mLength);
        ip0 += mLength;
        anchor = ip0;
        if (ip0 <= ilimit) {
            /* Fill Table */
            assert(base+curr+2 > istart);  /* check base overflow */
            hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2;  /* here because curr+2 could be > iend-8 */
            hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);

            /* check immediate repcode */
            while (ip0 <= ilimit) {
                U32 const current2 = (U32)(ip0-base);
                U32 const repIndex2 = current2 - offset_2;
                const BYTE* repMatch2 = repIndex2 < prefixStartIndex ?
                        dictBase - dictIndexDelta + repIndex2 :
                        base + repIndex2;
                if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */)
                     && (MEM_read32(repMatch2) == MEM_read32(ip0))) {
                    const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
                    size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
                    U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
                    ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
                    hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2;
                    ip0 += repLength2;
                    anchor = ip0;
                    continue;
                }
                break;
            }
        }

        /* Prepare for next iteration */
        assert(ip0 == anchor);
        ip1 = ip0 + stepSize;
    }

_cleanup:
    /* save reps for next block */
    rep[0] = offset_1;
    rep[1] = offset_2;

    /* Return the last literals size */
    return (size_t)(iend - anchor);
}


ZSTD_GEN_FAST_FN(dictMatchState, 4, 0)
ZSTD_GEN_FAST_FN(dictMatchState, 5, 0)
ZSTD_GEN_FAST_FN(dictMatchState, 6, 0)
ZSTD_GEN_FAST_FN(dictMatchState, 7, 0)

size_t ZSTD_compressBlock_fast_dictMatchState(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    U32 const mls = ms->cParams.minMatch;
    assert(ms->dictMatchState != NULL);
    switch(mls)
    {
    default: /* includes case 3 */
    case 4 :
        return ZSTD_compressBlock_fast_dictMatchState_4_0(ms, seqStore, rep, src, srcSize);
    case 5 :
        return ZSTD_compressBlock_fast_dictMatchState_5_0(ms, seqStore, rep, src, srcSize);
    case 6 :
        return ZSTD_compressBlock_fast_dictMatchState_6_0(ms, seqStore, rep, src, srcSize);
    case 7 :
        return ZSTD_compressBlock_fast_dictMatchState_7_0(ms, seqStore, rep, src, srcSize);
    }
}

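/* The extDict variant handles windows that span two non-contiguous segments:
 * indices below prefixStartIndex live in an older segment addressed through
 * dictBase, while newer indices are addressed through base. Matches may start
 * in the old segment, so match lengths are counted with ZSTD_count_2segments
 * and bounded by dictEnd. */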
static
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_compressBlock_fast_extDict_generic(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize, U32 const mls, U32 const hasStep)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32* const hashTable = ms->hashTable;
    U32 const hlog = cParams->hashLog;
    /* support stepSize of 0 */
    size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1;
    const BYTE* const base = ms->window.base;
    const BYTE* const dictBase = ms->window.dictBase;
    const BYTE* const istart = (const BYTE*)src;
    const BYTE* anchor = istart;
    const U32   endIndex = (U32)((size_t)(istart - base) + srcSize);
    const U32   lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog);
    const U32   dictStartIndex = lowLimit;
    const BYTE* const dictStart = dictBase + dictStartIndex;
    const U32   dictLimit = ms->window.dictLimit;
    const U32   prefixStartIndex = dictLimit < lowLimit ? lowLimit : dictLimit;
    const BYTE* const prefixStart = base + prefixStartIndex;
    const BYTE* const dictEnd = dictBase + prefixStartIndex;
    const BYTE* const iend = istart + srcSize;
    const BYTE* const ilimit = iend - 8;
    U32 offset_1=rep[0], offset_2=rep[1];
    U32 offsetSaved1 = 0, offsetSaved2 = 0;

    const BYTE* ip0 = istart;
    const BYTE* ip1;
    const BYTE* ip2;
    const BYTE* ip3;
    U32 current0;


    size_t hash0; /* hash for ip0 */
    size_t hash1; /* hash for ip1 */
    U32 idx; /* match idx for ip0 */
    const BYTE* idxBase; /* base pointer for idx */

    U32 offcode;
    const BYTE* match0;
    size_t mLength;
    const BYTE* matchEnd = 0; /* initialize to avoid warning, assert != 0 later */

    size_t step;
    const BYTE* nextStep;
    const size_t kStepIncr = (1 << (kSearchStrength - 1));

    (void)hasStep; /* not currently specialized on whether it's accelerated */

    DEBUGLOG(5, "ZSTD_compressBlock_fast_extDict_generic (offset_1=%u)", offset_1);

    /* switch to "regular" variant if extDict is invalidated due to maxDistance */
    if (prefixStartIndex == dictStartIndex)
        return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize);
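    /* Repcode offsets reaching beyond the start of the valid window cannot be used
     * in this block: stash them in offsetSaved1/2 (restored at _cleanup if they were
     * never replaced) and clear them so the search below ignores them. */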
    {   U32 const curr = (U32)(ip0 - base);
        U32 const maxRep = curr - dictStartIndex;
        if (offset_2 >= maxRep) offsetSaved2 = offset_2, offset_2 = 0;
        if (offset_1 >= maxRep) offsetSaved1 = offset_1, offset_1 = 0;
    }

    /* start each op */
_start: /* Requires: ip0 */

    step = stepSize;
    nextStep = ip0 + kStepIncr;

    /* calculate positions, ip0 - anchor == 0, so we skip step calc */
    ip1 = ip0 + 1;
    ip2 = ip0 + step;
    ip3 = ip2 + 1;

    if (ip3 >= ilimit) {
        goto _cleanup;
    }

    hash0 = ZSTD_hashPtr(ip0, hlog, mls);
    hash1 = ZSTD_hashPtr(ip1, hlog, mls);

    idx = hashTable[hash0];
    idxBase = idx < prefixStartIndex ? dictBase : base;

    do {
        {   /* load repcode match for ip[2] */
            U32 const current2 = (U32)(ip2 - base);
            U32 const repIndex = current2 - offset_1;
            const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base;
            U32 rval;
            if ( ((U32)(prefixStartIndex - repIndex) >= 4) /* intentional underflow */
                 & (offset_1 > 0) ) {
                rval = MEM_read32(repBase + repIndex);
            } else {
                rval = MEM_read32(ip2) ^ 1; /* guaranteed to not match. */
            }

            /* write back hash table entry */
            current0 = (U32)(ip0 - base);
            hashTable[hash0] = current0;

            /* check repcode at ip[2] */
            if (MEM_read32(ip2) == rval) {
                ip0 = ip2;
                match0 = repBase + repIndex;
                matchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
                assert((match0 != prefixStart) & (match0 != dictStart));
                mLength = ip0[-1] == match0[-1];
                ip0 -= mLength;
                match0 -= mLength;
                offcode = REPCODE1_TO_OFFBASE;
                mLength += 4;
                goto _match;
        }   }

        {   /* load match for ip[0] */
            U32 const mval = idx >= dictStartIndex ?
                    MEM_read32(idxBase + idx) :
                    MEM_read32(ip0) ^ 1; /* guaranteed not to match */

            /* check match at ip[0] */
            if (MEM_read32(ip0) == mval) {
                /* found a match! */
                goto _offset;
        }   }

        /* lookup ip[1] */
        idx = hashTable[hash1];
        idxBase = idx < prefixStartIndex ? dictBase : base;

        /* hash ip[2] */
        hash0 = hash1;
        hash1 = ZSTD_hashPtr(ip2, hlog, mls);

        /* advance to next positions */
        ip0 = ip1;
        ip1 = ip2;
        ip2 = ip3;

        /* write back hash table entry */
        current0 = (U32)(ip0 - base);
        hashTable[hash0] = current0;

        {   /* load match for ip[0] */
            U32 const mval = idx >= dictStartIndex ?
                    MEM_read32(idxBase + idx) :
                    MEM_read32(ip0) ^ 1; /* guaranteed not to match */

            /* check match at ip[0] */
            if (MEM_read32(ip0) == mval) {
                /* found a match! */
                goto _offset;
        }   }

        /* lookup ip[1] */
        idx = hashTable[hash1];
        idxBase = idx < prefixStartIndex ? dictBase : base;

        /* hash ip[2] */
        hash0 = hash1;
        hash1 = ZSTD_hashPtr(ip2, hlog, mls);

        /* advance to next positions */
        ip0 = ip1;
        ip1 = ip2;
        ip2 = ip0 + step;
        ip3 = ip1 + step;

        /* calculate step */
        if (ip2 >= nextStep) {
            step++;
            PREFETCH_L1(ip1 + 64);
            PREFETCH_L1(ip1 + 128);
            nextStep += kStepIncr;
        }
    } while (ip3 < ilimit);

_cleanup:
    /* Note that there are probably still a couple positions we could search.
     * However, it seems to be a meaningful performance hit to try to search
     * them. So let's not. */

    /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
     * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
    offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;

    /* save reps for next block */
    rep[0] = offset_1 ? offset_1 : offsetSaved1;
    rep[1] = offset_2 ? offset_2 : offsetSaved2;

    /* Return the last literals size */
    return (size_t)(iend - anchor);

_offset: /* Requires: ip0, idx, idxBase */

    /* Compute the offset code. */
    {   U32 const offset = current0 - idx;
        const BYTE* const lowMatchPtr = idx < prefixStartIndex ? dictStart : prefixStart;
        matchEnd = idx < prefixStartIndex ? dictEnd : iend;
        match0 = idxBase + idx;
        offset_2 = offset_1;
        offset_1 = offset;
        offcode = OFFSET_TO_OFFBASE(offset);
        mLength = 4;

        /* Count the backwards match length. */
        while (((ip0>anchor) & (match0>lowMatchPtr)) && (ip0[-1] == match0[-1])) {
            ip0--;
            match0--;
            mLength++;
    }   }

_match: /* Requires: ip0, match0, offcode, matchEnd */

    /* Count the forward length. */
    assert(matchEnd != 0);
    mLength += ZSTD_count_2segments(ip0 + mLength, match0 + mLength, iend, matchEnd, prefixStart);

    ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength);

    ip0 += mLength;
    anchor = ip0;

    /* write next hash table entry */
    if (ip1 < ip0) {
        hashTable[hash1] = (U32)(ip1 - base);
    }

    /* Fill table and check for immediate repcode. */
    if (ip0 <= ilimit) {
        /* Fill Table */
        assert(base+current0+2 > istart);  /* check base overflow */
        hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2;  /* here because current+2 could be > iend-8 */
        hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);

        while (ip0 <= ilimit) {
            U32 const repIndex2 = (U32)(ip0-base) - offset_2;
            const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
            if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 > 0))  /* intentional underflow */
                 && (MEM_read32(repMatch2) == MEM_read32(ip0)) ) {
                const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
                size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
                { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; }  /* swap offset_2 <=> offset_1 */
                ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
                hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base);
                ip0 += repLength2;
                anchor = ip0;
                continue;
            }
            break;
    }   }

    goto _start;
}

ZSTD_GEN_FAST_FN(extDict, 4, 0)
ZSTD_GEN_FAST_FN(extDict, 5, 0)
ZSTD_GEN_FAST_FN(extDict, 6, 0)
ZSTD_GEN_FAST_FN(extDict, 7, 0)

size_t ZSTD_compressBlock_fast_extDict(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    U32 const mls = ms->cParams.minMatch;
    assert(ms->dictMatchState == NULL);
    switch(mls)
    {
    default: /* includes case 3 */
    case 4 :
        return ZSTD_compressBlock_fast_extDict_4_0(ms, seqStore, rep, src, srcSize);
    case 5 :
        return ZSTD_compressBlock_fast_extDict_5_0(ms, seqStore, rep, src, srcSize);
    case 6 :
        return ZSTD_compressBlock_fast_extDict_6_0(ms, seqStore, rep, src, srcSize);
    case 7 :
        return ZSTD_compressBlock_fast_extDict_7_0(ms, seqStore, rep, src, srcSize);
    }
}