/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */

/* zstd_decompress_block :
 * this module takes care of decompressing _compressed_ block */

/*-*******************************************************
*  Dependencies
*********************************************************/
#include "../common/zstd_deps.h"   /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */
#include "../common/compiler.h"    /* prefetch */
#include "../common/cpu.h"         /* bmi2 */
#include "../common/mem.h"         /* low level memory routines */
#define FSE_STATIC_LINKING_ONLY
#include "../common/fse.h"
#include "../common/huf.h"
#include "../common/zstd_internal.h"
#include "zstd_decompress_internal.h"   /* ZSTD_DCtx */
#include "zstd_ddict.h"  /* ZSTD_DDictDictContent */
#include "zstd_decompress_block.h"
#include "../common/bits.h"  /* ZSTD_highbit32 */

/*_*******************************************************
*  Macros
**********************************************************/

/* These two optional macros force the use one way or another of the two
 * ZSTD_decompressSequences implementations. You can't force in both directions
 * at the same time.
 */
#if defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
    defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
#error "Cannot force the use of the short and the long ZSTD_decompressSequences variants!"
#endif


/*_*******************************************************
*  Memory operations
**********************************************************/
static void ZSTD_copy4(void* dst, const void* src) { ZSTD_memcpy(dst, src, 4); }


/*-*************************************************************
 *   Block decoding
 ***************************************************************/

static size_t ZSTD_blockSizeMax(ZSTD_DCtx const* dctx)
{
    size_t const blockSizeMax = dctx->isFrameDecompression ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX;
    assert(blockSizeMax <= ZSTD_BLOCKSIZE_MAX);
    return blockSizeMax;
}

/*! ZSTD_getcBlockSize() :
 *  Provides the size of the compressed block from the block header `src` */
size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
                          blockProperties_t* bpPtr)
{
    RETURN_ERROR_IF(srcSize < ZSTD_blockHeaderSize, srcSize_wrong, "");

    {   U32 const cBlockHeader = MEM_readLE24(src);
        U32 const cSize = cBlockHeader >> 3;
        bpPtr->lastBlock = cBlockHeader & 1;
        bpPtr->blockType = (blockType_e)((cBlockHeader >> 1) & 3);
        bpPtr->origSize = cSize;   /* only useful for RLE */
        if (bpPtr->blockType == bt_rle) return 1;
        RETURN_ERROR_IF(bpPtr->blockType == bt_reserved, corruption_detected, "");
        return cSize;
    }
}
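/* Worked example (illustrative only; the byte values are assumptions, not
 * taken from a real frame) : a block header of bytes { 0x25, 0x00, 0x01 }
 * reads as cBlockHeader = MEM_readLE24() = 0x010025, so
 *   lastBlock = 0x010025 & 1        = 1  (final block of the frame),
 *   blockType = (0x010025 >> 1) & 3 = 2  (bt_compressed),
 *   cSize     = 0x010025 >> 3       = 8196 bytes of block content. */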
/* Allocate buffer for literals, either overlapping current dst, or split between dst and litExtraBuffer, or stored entirely within litExtraBuffer */
static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize,
    const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately)
{
    size_t const blockSizeMax = ZSTD_blockSizeMax(dctx);
    assert(litSize <= blockSizeMax);
    assert(dctx->isFrameDecompression || streaming == not_streaming);
    assert(expectedWriteSize <= blockSizeMax);
    if (streaming == not_streaming && dstCapacity > blockSizeMax + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) {
        /* If we aren't streaming, we can just put the literals after the output
         * of the current block. We don't need to worry about overwriting the
         * extDict of our window, because it doesn't exist.
         * So if we have space after the end of the block, just put it there.
         */
        dctx->litBuffer = (BYTE*)dst + blockSizeMax + WILDCOPY_OVERLENGTH;
        dctx->litBufferEnd = dctx->litBuffer + litSize;
        dctx->litBufferLocation = ZSTD_in_dst;
    } else if (litSize <= ZSTD_LITBUFFEREXTRASIZE) {
        /* Literals fit entirely within the extra buffer, put them there to avoid
         * having to split the literals.
         */
        dctx->litBuffer = dctx->litExtraBuffer;
        dctx->litBufferEnd = dctx->litBuffer + litSize;
        dctx->litBufferLocation = ZSTD_not_in_dst;
    } else {
        assert(blockSizeMax > ZSTD_LITBUFFEREXTRASIZE);
        /* Literals must be split between the output block and the extra lit
         * buffer. We fill the extra lit buffer with the tail of the literals,
         * and put the rest of the literals at the end of the block, with
         * WILDCOPY_OVERLENGTH of buffer room to allow for overreads.
         * This MUST not write more than our maxBlockSize beyond dst, because in
         * streaming mode, that could overwrite part of our extDict window.
         */
        if (splitImmediately) {
            /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */
            dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;
            dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE;
        } else {
            /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */
            dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize;
            dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize;
        }
        dctx->litBufferLocation = ZSTD_split;
        assert(dctx->litBufferEnd <= (BYTE*)dst + expectedWriteSize);
    }
}
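/* Rough sketch of the three placements chosen above (illustrative only,
 * not to scale) :
 *
 *   ZSTD_in_dst     : dst [ block output ............ | pad | literals ]
 *   ZSTD_not_in_dst : dst [ block output ]     litExtraBuffer [ literals ]
 *   ZSTD_split      : dst [ block output .. | literals head ]  litExtraBuffer [ literals tail ]
 */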

/*! ZSTD_decodeLiteralsBlock() :
 * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored
 * in the dstBuffer.  If there is room to do so, it will be stored in full in the excess dst space after where the current
 * block will be output.  Otherwise it will be stored at the end of the current dst blockspace, with a small portion being
 * stored in dctx->litExtraBuffer to help keep it "ahead" of the current output write.
 *
 * @return : nb of bytes read from src (< srcSize )
 *  note : symbol not declared but exposed for fullbench */
static size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                          const void* src, size_t srcSize,   /* note : srcSize < BLOCKSIZE */
                          void* dst, size_t dstCapacity, const streaming_operation streaming)
{
    DEBUGLOG(5, "ZSTD_decodeLiteralsBlock");
    RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, "");

    {   const BYTE* const istart = (const BYTE*) src;
        symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3);
        size_t const blockSizeMax = ZSTD_blockSizeMax(dctx);

        switch(litEncType)
        {
        case set_repeat:
            DEBUGLOG(5, "set_repeat flag : re-using stats from previous compressed literals block");
            RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted, "");
            ZSTD_FALLTHROUGH;

        case set_compressed:
            RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need up to 5 for case 3");
            {   size_t lhSize, litSize, litCSize;
                U32 singleStream=0;
                U32 const lhlCode = (istart[0] >> 2) & 3;
                U32 const lhc = MEM_readLE32(istart);
                size_t hufSuccess;
                size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity);
                int const flags = 0
                    | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0)
                    | (dctx->disableHufAsm ? HUF_flags_disableAsm : 0);
                switch(lhlCode)
                {
                case 0: case 1: default:   /* note : default is impossible, since lhlCode is within [0..3] */
                    /* 2 - 2 - 10 - 10 */
                    singleStream = !lhlCode;
                    lhSize = 3;
                    litSize  = (lhc >> 4) & 0x3FF;
                    litCSize = (lhc >> 14) & 0x3FF;
                    break;
                case 2:
                    /* 2 - 2 - 14 - 14 */
                    lhSize = 4;
                    litSize  = (lhc >> 4) & 0x3FFF;
                    litCSize = lhc >> 18;
                    break;
                case 3:
                    /* 2 - 2 - 18 - 18 */
                    lhSize = 5;
                    litSize  = (lhc >> 4) & 0x3FFFF;
                    litCSize = (lhc >> 22) + ((size_t)istart[4] << 10);
                    break;
                }
                RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
                RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, "");
                if (!singleStream)
                    RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong,
                        "Not enough literals (%zu) for the 4-streams mode (min %u)",
                        litSize, MIN_LITERALS_FOR_4_STREAMS);
                RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, "");
                RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, "");
                ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0);

                /* prefetch huffman table if cold */
                if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) {
                    PREFETCH_AREA(dctx->HUFptr, sizeof(dctx->entropy.hufTable));
                }

                if (litEncType==set_repeat) {
                    if (singleStream) {
                        hufSuccess = HUF_decompress1X_usingDTable(
                            dctx->litBuffer, litSize, istart+lhSize, litCSize,
                            dctx->HUFptr, flags);
                    } else {
                        assert(litSize >= MIN_LITERALS_FOR_4_STREAMS);
                        hufSuccess = HUF_decompress4X_usingDTable(
                            dctx->litBuffer, litSize, istart+lhSize, litCSize,
                            dctx->HUFptr, flags);
                    }
                } else {
                    if (singleStream) {
#if defined(HUF_FORCE_DECOMPRESS_X2)
                        hufSuccess = HUF_decompress1X_DCtx_wksp(
                            dctx->entropy.hufTable, dctx->litBuffer, litSize,
                            istart+lhSize, litCSize, dctx->workspace,
                            sizeof(dctx->workspace), flags);
#else
                        hufSuccess = HUF_decompress1X1_DCtx_wksp(
                            dctx->entropy.hufTable, dctx->litBuffer, litSize,
                            istart+lhSize, litCSize, dctx->workspace,
                            sizeof(dctx->workspace), flags);
#endif
                    } else {
                        hufSuccess = HUF_decompress4X_hufOnly_wksp(
                            dctx->entropy.hufTable, dctx->litBuffer, litSize,
                            istart+lhSize, litCSize, dctx->workspace,
                            sizeof(dctx->workspace), flags);
                    }
                }
                if (dctx->litBufferLocation == ZSTD_split)
                {
                    assert(litSize > ZSTD_LITBUFFEREXTRASIZE);
                    ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE);
                    ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE);
                    dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;
                    dctx->litBufferEnd -= WILDCOPY_OVERLENGTH;
                    assert(dctx->litBufferEnd <= (BYTE*)dst + blockSizeMax);
                }

                RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, "");

                dctx->litPtr = dctx->litBuffer;
                dctx->litSize = litSize;
                dctx->litEntropy = 1;
                if (litEncType==set_compressed) dctx->HUFptr = dctx->entropy.hufTable;
                return litCSize + lhSize;
            }

        case set_basic:
            {   size_t litSize, lhSize;
                U32 const lhlCode = ((istart[0]) >> 2) & 3;
                size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity);
                switch(lhlCode)
                {
                case 0: case 2: default:   /* note : default is impossible, since lhlCode is within [0..3] */
                    lhSize = 1;
                    litSize = istart[0] >> 3;
                    break;
                case 1:
                    lhSize = 2;
                    litSize = MEM_readLE16(istart) >> 4;
                    break;
                case 3:
                    lhSize = 3;
                    RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize = 3");
                    litSize = MEM_readLE24(istart) >> 4;
                    break;
                }

                RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
                RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, "");
                RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");
                ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);
                if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) {  /* risk reading beyond src buffer with wildcopy */
                    RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected, "");
                    if (dctx->litBufferLocation == ZSTD_split)
                    {
                        ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize - ZSTD_LITBUFFEREXTRASIZE);
                        ZSTD_memcpy(dctx->litExtraBuffer, istart + lhSize + litSize - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE);
                    }
                    else
                    {
                        ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize);
                    }
                    dctx->litPtr = dctx->litBuffer;
                    dctx->litSize = litSize;
                    return lhSize+litSize;
                }
                /* direct reference into compressed stream */
                dctx->litPtr = istart+lhSize;
                dctx->litSize = litSize;
                dctx->litBufferEnd = dctx->litPtr + litSize;
                dctx->litBufferLocation = ZSTD_not_in_dst;
                return lhSize+litSize;
            }

        case set_rle:
            {   U32 const lhlCode = ((istart[0]) >> 2) & 3;
                size_t litSize, lhSize;
                size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity);
                switch(lhlCode)
                {
                case 0: case 2: default:   /* note : default is impossible, since lhlCode is within [0..3] */
                    lhSize = 1;
                    litSize = istart[0] >> 3;
                    break;
                case 1:
                    lhSize = 2;
                    RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 3");
                    litSize = MEM_readLE16(istart) >> 4;
                    break;
                case 3:
                    lhSize = 3;
                    RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 4");
                    litSize = MEM_readLE24(istart) >> 4;
                    break;
                }
                RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
                RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, "");
                RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");
                ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);
                if (dctx->litBufferLocation == ZSTD_split)
                {
                    ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize - ZSTD_LITBUFFEREXTRASIZE);
                    ZSTD_memset(dctx->litExtraBuffer, istart[lhSize], ZSTD_LITBUFFEREXTRASIZE);
                }
                else
                {
                    ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize);
                }
                dctx->litPtr = dctx->litBuffer;
                dctx->litSize = litSize;
                return lhSize+1;
            }
        default:
            RETURN_ERROR(corruption_detected, "impossible");
        }
    }
}

/* Hidden declaration for fullbench */
size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx,
                          const void* src, size_t srcSize,
                          void* dst, size_t dstCapacity);
size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx,
                          const void* src, size_t srcSize,
                          void* dst, size_t dstCapacity)
{
    dctx->isFrameDecompression = 0;
    return ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, not_streaming);
}
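/* Worked literals-header example (illustrative only; the byte value is an
 * assumption) : a raw literals block whose first header byte is
 * 0x58 = 0b01011000 decodes, per the set_basic path above, as
 *   litEncType = 0x58 & 3        = 0 (set_basic),
 *   lhlCode    = (0x58 >> 2) & 3 = 2, hence lhSize = 1,
 *   litSize    = 0x58 >> 3       = 11 literal bytes following the header. */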
/* Default FSE distribution tables.
 * These are pre-calculated FSE decoding tables using default distributions as defined in specification :
 * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions
 * They were generated programmatically with the following method :
 * - start from default distributions, present in /lib/common/zstd_internal.h
 * - generate tables normally, using ZSTD_buildFSETable()
 * - print out the content of tables
 * - prettify output, reported below, test with fuzzer to ensure it's correct */

/* Default FSE distribution table for Literal Lengths */
static const ZSTD_seqSymbol LL_defaultDTable[(1<<LL_DEFAULTNORMLOG)+1] = {
     {  1,  1,  1, LL_DEFAULTNORMLOG},  /* header : fastMode, tableLog */
     /* nextState, nbAddBits, nbBits, baseVal */
     {  0,  0,  4,    0},  { 16,  0,  4,    0},
     { 32,  0,  5,    1},  {  0,  0,  5,    3},
     {  0,  0,  5,    4},  {  0,  0,  5,    6},
     {  0,  0,  5,    7},  {  0,  0,  5,    9},
     {  0,  0,  5,   10},  {  0,  0,  5,   12},
     {  0,  0,  6,   14},  {  0,  1,  5,   16},
     {  0,  1,  5,   20},  {  0,  1,  5,   22},
     {  0,  2,  5,   28},  {  0,  3,  5,   32},
     {  0,  4,  5,   48},  { 32,  6,  5,   64},
     {  0,  7,  5,  128},  {  0,  8,  6,  256},
     {  0, 10,  6, 1024},  {  0, 12,  6, 4096},
     { 32,  0,  4,    0},  {  0,  0,  4,    1},
     {  0,  0,  5,    2},  { 32,  0,  5,    4},
     {  0,  0,  5,    5},  { 32,  0,  5,    7},
     {  0,  0,  5,    8},  { 32,  0,  5,   10},
     {  0,  0,  5,   11},  {  0,  0,  6,   13},
     { 32,  1,  5,   16},  {  0,  1,  5,   18},
     { 32,  1,  5,   22},  {  0,  2,  5,   24},
     { 32,  3,  5,   32},  {  0,  3,  5,   40},
     {  0,  6,  4,   64},  { 16,  6,  4,   64},
     { 32,  7,  5,  128},  {  0,  9,  6,  512},
     {  0, 11,  6, 2048},  { 48,  0,  4,    0},
     { 16,  0,  4,    1},  { 32,  0,  5,    2},
     { 32,  0,  5,    3},  { 32,  0,  5,    5},
     { 32,  0,  5,    6},  { 32,  0,  5,    8},
     { 32,  0,  5,    9},  { 32,  0,  5,   11},
     { 32,  0,  5,   12},  {  0,  0,  6,   15},
     { 32,  1,  5,   18},  { 32,  1,  5,   20},
     { 32,  2,  5,   24},  { 32,  2,  5,   28},
     { 32,  3,  5,   40},  { 32,  4,  5,   48},
     {  0, 16,  6,65536},  {  0, 15,  6,32768},
     {  0, 14,  6,16384},  {  0, 13,  6, 8192},
};   /* LL_defaultDTable */
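/* Reading one row (illustrative) : the entry {  0, 12,  6, 4096} means
 * "read 12 extra bits and add them to baseValue 4096, giving a literal
 * length in [4096..8191], then read 6 bits and add them to nextState 0
 * to select the next table cell". */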
/* Default FSE distribution table for Offset Codes */
static const ZSTD_seqSymbol OF_defaultDTable[(1<<OF_DEFAULTNORMLOG)+1] = {
    {  1,  1,  1, OF_DEFAULTNORMLOG},  /* header : fastMode, tableLog */
    /* nextState, nbAddBits, nbBits, baseVal */
    {  0,  0,  5,    0},     {  0,  6,  4,   61},
    {  0,  9,  5,  509},     {  0, 15,  5,32765},
    {  0, 21,  5,2097149},   {  0,  3,  5,    5},
    {  0,  7,  4,  125},     {  0, 12,  5, 4093},
    {  0, 18,  5,262141},    {  0, 23,  5,8388605},
    {  0,  5,  5,   29},     {  0,  8,  4,  253},
    {  0, 14,  5,16381},     {  0, 20,  5,1048573},
    {  0,  2,  5,    1},     { 16,  7,  4,  125},
    {  0, 11,  5, 2045},     {  0, 17,  5,131069},
    {  0, 22,  5,4194301},   {  0,  4,  5,   13},
    { 16,  8,  4,  253},     {  0, 13,  5, 8189},
    {  0, 19,  5,524285},    {  0,  1,  5,    1},
    { 16,  6,  4,   61},     {  0, 10,  5, 1021},
    {  0, 16,  5,65533},     {  0, 28,  5,268435453},
    {  0, 27,  5,134217725}, {  0, 26,  5,67108861},
    {  0, 25,  5,33554429},  {  0, 24,  5,16777213},
};   /* OF_defaultDTable */


/* Default FSE distribution table for Match Lengths */
static const ZSTD_seqSymbol ML_defaultDTable[(1<<ML_DEFAULTNORMLOG)+1] = {
    {  1,  1,  1, ML_DEFAULTNORMLOG},  /* header : fastMode, tableLog */
    /* nextState, nbAddBits, nbBits, baseVal */
    {  0,  0,  6,    3},  {  0,  0,  4,    4},
    { 32,  0,  5,    5},  {  0,  0,  5,    6},
    {  0,  0,  5,    8},  {  0,  0,  5,    9},
    {  0,  0,  5,   11},  {  0,  0,  6,   13},
    {  0,  0,  6,   16},  {  0,  0,  6,   19},
    {  0,  0,  6,   22},  {  0,  0,  6,   25},
    {  0,  0,  6,   28},  {  0,  0,  6,   31},
    {  0,  0,  6,   34},  {  0,  1,  6,   37},
    {  0,  1,  6,   41},  {  0,  2,  6,   47},
    {  0,  3,  6,   59},  {  0,  4,  6,   83},
    {  0,  7,  6,  131},  {  0,  9,  6,  515},
    { 16,  0,  4,    4},  {  0,  0,  4,    5},
    { 32,  0,  5,    6},  {  0,  0,  5,    7},
    { 32,  0,  5,    9},  {  0,  0,  5,   10},
    {  0,  0,  6,   12},  {  0,  0,  6,   15},
    {  0,  0,  6,   18},  {  0,  0,  6,   21},
    {  0,  0,  6,   24},  {  0,  0,  6,   27},
    {  0,  0,  6,   30},  {  0,  0,  6,   33},
    {  0,  1,  6,   35},  {  0,  1,  6,   39},
    {  0,  2,  6,   43},  {  0,  3,  6,   51},
    {  0,  4,  6,   67},  {  0,  5,  6,   99},
    {  0,  8,  6,  259},  { 32,  0,  4,    4},
    { 48,  0,  4,    4},  { 16,  0,  4,    5},
    { 32,  0,  5,    7},  { 32,  0,  5,    8},
    { 32,  0,  5,   10},  { 32,  0,  5,   11},
    {  0,  0,  6,   14},  {  0,  0,  6,   17},
    {  0,  0,  6,   20},  {  0,  0,  6,   23},
    {  0,  0,  6,   26},  {  0,  0,  6,   29},
    {  0,  0,  6,   32},  {  0, 16,  6,65539},
    {  0, 15,  6,32771},  {  0, 14,  6,16387},
    {  0, 13,  6, 8195},  {  0, 12,  6, 4099},
    {  0, 11,  6, 2051},  {  0, 10,  6, 1027},
};   /* ML_defaultDTable */


static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U8 nbAddBits)
{
    void* ptr = dt;
    ZSTD_seqSymbol_header* const DTableH = (ZSTD_seqSymbol_header*)ptr;
    ZSTD_seqSymbol* const cell = dt + 1;

    DTableH->tableLog = 0;
    DTableH->fastMode = 0;

    cell->nbBits = 0;
    cell->nextState = 0;
    assert(nbAddBits < 255);
    cell->nbAdditionalBits = nbAddBits;
    cell->baseValue = baseValue;
}
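/* Note : the RLE "table" built above is a degenerate FSE table : a single
 * cell with tableLog==0 and nbBits==0, so every decode step yields the same
 * (baseValue, nbAdditionalBits) pair without consuming any state bits. */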


/* ZSTD_buildFSETable() :
 * generate FSE decoding table for one symbol (ll, ml or off)
 * cannot fail if input is valid =>
 * all inputs are presumed validated at this stage */
FORCE_INLINE_TEMPLATE
void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
            const short* normalizedCounter, unsigned maxSymbolValue,
            const U32* baseValue, const U8* nbAdditionalBits,
            unsigned tableLog, void* wksp, size_t wkspSize)
{
    ZSTD_seqSymbol* const tableDecode = dt+1;
    U32 const maxSV1 = maxSymbolValue + 1;
    U32 const tableSize = 1 << tableLog;

    U16* symbolNext = (U16*)wksp;
    BYTE* spread = (BYTE*)(symbolNext + MaxSeq + 1);
    U32 highThreshold = tableSize - 1;


    /* Sanity Checks */
    assert(maxSymbolValue <= MaxSeq);
    assert(tableLog <= MaxFSELog);
    assert(wkspSize >= ZSTD_BUILD_FSE_TABLE_WKSP_SIZE);
    (void)wkspSize;
    /* Init, lay down lowprob symbols */
    {   ZSTD_seqSymbol_header DTableH;
        DTableH.tableLog = tableLog;
        DTableH.fastMode = 1;
        {   S16 const largeLimit= (S16)(1 << (tableLog-1));
            U32 s;
            for (s=0; s<maxSV1; s++) {
                if (normalizedCounter[s]==-1) {
                    tableDecode[highThreshold--].baseValue = s;
                    symbolNext[s] = 1;
                } else {
                    if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0;
                    assert(normalizedCounter[s]>=0);
                    symbolNext[s] = (U16)normalizedCounter[s];
        }   }   }
        ZSTD_memcpy(dt, &DTableH, sizeof(DTableH));
    }

    /* Spread symbols */
    assert(tableSize <= 512);
    /* Specialized symbol spreading for the case when there are
     * no low probability (-1 count) symbols. When compressing
     * small blocks we avoid low probability symbols to hit this
     * case, since header decoding speed matters more.
     */
    if (highThreshold == tableSize - 1) {
        size_t const tableMask = tableSize-1;
        size_t const step = FSE_TABLESTEP(tableSize);
        /* First lay down the symbols in order.
         * We use a uint64_t to lay down 8 bytes at a time. This reduces branch
         * misses since small blocks generally have small table logs, so nearly
         * all symbols have counts <= 8. We ensure we have 8 bytes at the end of
         * our buffer to handle the over-write.
         */
        {
            U64 const add = 0x0101010101010101ull;
            size_t pos = 0;
            U64 sv = 0;
            U32 s;
            for (s=0; s<maxSV1; ++s, sv += add) {
                int i;
                int const n = normalizedCounter[s];
                MEM_write64(spread + pos, sv);
                for (i = 8; i < n; i += 8) {
                    MEM_write64(spread + pos + i, sv);
                }
                assert(n>=0);
                pos += (size_t)n;
            }
        }
        /* Now we spread those positions across the table.
         * The benefit of doing it in two stages is that we avoid the
         * variable size inner loop, which caused lots of branch misses.
         * Now we can run through all the positions without any branch misses.
         * We unroll the loop twice, since that is what empirically worked best.
         */
        {
            size_t position = 0;
            size_t s;
            size_t const unroll = 2;
            assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */
            for (s = 0; s < (size_t)tableSize; s += unroll) {
                size_t u;
                for (u = 0; u < unroll; ++u) {
                    size_t const uPosition = (position + (u * step)) & tableMask;
                    tableDecode[uPosition].baseValue = spread[s + u];
                }
                position = (position + (unroll * step)) & tableMask;
            }
            assert(position == 0);
        }
    } else {
        U32 const tableMask = tableSize-1;
        U32 const step = FSE_TABLESTEP(tableSize);
        U32 s, position = 0;
        for (s=0; s<maxSV1; s++) {
            int i;
            int const n = normalizedCounter[s];
            for (i=0; i<n; i++) {
                tableDecode[position].baseValue = s;
                position = (position + step) & tableMask;
                while (UNLIKELY(position > highThreshold)) position = (position + step) & tableMask;   /* lowprob area */
        }   }
        assert(position == 0);  /* position must reach all cells once, otherwise normalizedCounter is incorrect */
    }

    /* Build Decoding table */
    {
        U32 u;
        for (u=0; u<tableSize; u++) {
            U32 const symbol = tableDecode[u].baseValue;
            U32 const nextState = symbolNext[symbol]++;
            tableDecode[u].nbBits = (BYTE) (tableLog - ZSTD_highbit32(nextState) );
            tableDecode[u].nextState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
            assert(nbAdditionalBits[symbol] < 255);
            tableDecode[u].nbAdditionalBits = nbAdditionalBits[symbol];
            tableDecode[u].baseValue = baseValue[symbol];
        }
    }
}
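/* Sketch of how a decoder consumes the fields filled in above (illustrative
 * pseudo-steps, not upstream code) : for a current state s,
 *     value = tableDecode[s].baseValue + readBits(tableDecode[s].nbAdditionalBits);
 *     s'    = tableDecode[s].nextState + readBits(tableDecode[s].nbBits);
 * i.e. nbBits counts state-refill bits, while nbAdditionalBits extends the
 * decoded base value. */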
/* Avoids the FORCE_INLINE of the _body() function. */
static void ZSTD_buildFSETable_body_default(ZSTD_seqSymbol* dt,
            const short* normalizedCounter, unsigned maxSymbolValue,
            const U32* baseValue, const U8* nbAdditionalBits,
            unsigned tableLog, void* wksp, size_t wkspSize)
{
    ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,
            baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
}

#if DYNAMIC_BMI2
BMI2_TARGET_ATTRIBUTE static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol* dt,
            const short* normalizedCounter, unsigned maxSymbolValue,
            const U32* baseValue, const U8* nbAdditionalBits,
            unsigned tableLog, void* wksp, size_t wkspSize)
{
    ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,
            baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
}
#endif

void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
            const short* normalizedCounter, unsigned maxSymbolValue,
            const U32* baseValue, const U8* nbAdditionalBits,
            unsigned tableLog, void* wksp, size_t wkspSize, int bmi2)
{
#if DYNAMIC_BMI2
    if (bmi2) {
        ZSTD_buildFSETable_body_bmi2(dt, normalizedCounter, maxSymbolValue,
                baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
        return;
    }
#endif
    (void)bmi2;
    ZSTD_buildFSETable_body_default(dt, normalizedCounter, maxSymbolValue,
            baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
}


/*! ZSTD_buildSeqTable() :
 * @return : nb bytes read from src,
 *           or an error code if it fails */
static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymbol** DTablePtr,
                                 symbolEncodingType_e type, unsigned max, U32 maxLog,
                                 const void* src, size_t srcSize,
                                 const U32* baseValue, const U8* nbAdditionalBits,
                                 const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable,
                                 int ddictIsCold, int nbSeq, U32* wksp, size_t wkspSize,
                                 int bmi2)
{
    switch(type)
    {
    case set_rle :
        RETURN_ERROR_IF(!srcSize, srcSize_wrong, "");
        RETURN_ERROR_IF((*(const BYTE*)src) > max, corruption_detected, "");
        {   U32 const symbol = *(const BYTE*)src;
            U32 const baseline = baseValue[symbol];
            U8 const nbBits = nbAdditionalBits[symbol];
            ZSTD_buildSeqTable_rle(DTableSpace, baseline, nbBits);
        }
        *DTablePtr = DTableSpace;
        return 1;
    case set_basic :
        *DTablePtr = defaultTable;
        return 0;
    case set_repeat:
        RETURN_ERROR_IF(!flagRepeatTable, corruption_detected, "");
        /* prefetch FSE table if used */
        if (ddictIsCold && (nbSeq > 24 /* heuristic */)) {
            const void* const pStart = *DTablePtr;
            size_t const pSize = sizeof(ZSTD_seqSymbol) * (SEQSYMBOL_TABLE_SIZE(maxLog));
            PREFETCH_AREA(pStart, pSize);
        }
        return 0;
    case set_compressed :
        {   unsigned tableLog;
            S16 norm[MaxSeq+1];
            size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize);
            RETURN_ERROR_IF(FSE_isError(headerSize), corruption_detected, "");
            RETURN_ERROR_IF(tableLog > maxLog, corruption_detected, "");
            ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog, wksp, wkspSize, bmi2);
            *DTablePtr = DTableSpace;
            return headerSize;
        }
    default :
        assert(0);
        RETURN_ERROR(GENERIC, "impossible");
    }
}

size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                             const void* src, size_t srcSize)
{
    const BYTE* const istart = (const BYTE*)src;
    const BYTE* const iend = istart + srcSize;
    const BYTE* ip = istart;
    int nbSeq;
    DEBUGLOG(5, "ZSTD_decodeSeqHeaders");

    /* check */
    RETURN_ERROR_IF(srcSize < MIN_SEQUENCES_SIZE, srcSize_wrong, "");

    /* SeqHead */
    nbSeq = *ip++;
    if (nbSeq > 0x7F) {
        if (nbSeq == 0xFF) {
            RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, "");
            nbSeq = MEM_readLE16(ip) + LONGNBSEQ;
            ip+=2;
        } else {
            RETURN_ERROR_IF(ip >= iend, srcSize_wrong, "");
            nbSeq = ((nbSeq-0x80)<<8) + *ip++;
        }
    }
    *nbSeqPtr = nbSeq;

    if (nbSeq == 0) {
        /* No sequence : section ends immediately */
        RETURN_ERROR_IF(ip != iend, corruption_detected,
            "extraneous data present in the Sequences section");
        return (size_t)(ip - istart);
    }

    /* FSE table descriptors */
    RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */
    RETURN_ERROR_IF(*ip & 3, corruption_detected, ""); /* The last field, Reserved, must be all-zeroes. */
    {   symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6);
        symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3);
        symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3);
        ip++;

        /* Build DTables */
        {   size_t const llhSize = ZSTD_buildSeqTable(dctx->entropy.LLTable, &dctx->LLTptr,
                                                      LLtype, MaxLL, LLFSELog,
                                                      ip, iend-ip,
                                                      LL_base, LL_bits,
                                                      LL_defaultDTable, dctx->fseEntropy,
                                                      dctx->ddictIsCold, nbSeq,
                                                      dctx->workspace, sizeof(dctx->workspace),
                                                      ZSTD_DCtx_get_bmi2(dctx));
            RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "ZSTD_buildSeqTable failed");
            ip += llhSize;
        }

        {   size_t const ofhSize = ZSTD_buildSeqTable(dctx->entropy.OFTable, &dctx->OFTptr,
                                                      OFtype, MaxOff, OffFSELog,
                                                      ip, iend-ip,
                                                      OF_base, OF_bits,
                                                      OF_defaultDTable, dctx->fseEntropy,
                                                      dctx->ddictIsCold, nbSeq,
                                                      dctx->workspace, sizeof(dctx->workspace),
                                                      ZSTD_DCtx_get_bmi2(dctx));
            RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "ZSTD_buildSeqTable failed");
            ip += ofhSize;
        }

        {   size_t const mlhSize = ZSTD_buildSeqTable(dctx->entropy.MLTable, &dctx->MLTptr,
                                                      MLtype, MaxML, MLFSELog,
                                                      ip, iend-ip,
                                                      ML_base, ML_bits,
                                                      ML_defaultDTable, dctx->fseEntropy,
                                                      dctx->ddictIsCold, nbSeq,
                                                      dctx->workspace, sizeof(dctx->workspace),
                                                      ZSTD_DCtx_get_bmi2(dctx));
            RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "ZSTD_buildSeqTable failed");
            ip += mlhSize;
        }
    }

    return ip-istart;
}
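/* Worked example for the descriptor byte parsed above (illustrative only;
 * the byte value is an assumption) : *ip == 0x94 = 0b10010100 gives
 *   LLtype = 0x94 >> 6       = 2 (set_compressed),
 *   OFtype = (0x94 >> 4) & 3 = 1 (set_rle),
 *   MLtype = (0x94 >> 2) & 3 = 1 (set_rle),
 * and the low 2 Reserved bits are zero, as required. */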

typedef struct {
    size_t litLength;
    size_t matchLength;
    size_t offset;
} seq_t;

typedef struct {
    size_t state;
    const ZSTD_seqSymbol* table;
} ZSTD_fseState;

typedef struct {
    BIT_DStream_t DStream;
    ZSTD_fseState stateLL;
    ZSTD_fseState stateOffb;
    ZSTD_fseState stateML;
    size_t prevOffset[ZSTD_REP_NUM];
} seqState_t;

/*! ZSTD_overlapCopy8() :
 *  Copies 8 bytes from ip to op and updates op and ip where ip <= op.
 *  If the offset is < 8 then the offset is spread to at least 8 bytes.
 *
 *  Precondition: *ip <= *op
 *  Postcondition: *op - *ip >= 8
 */
HINT_INLINE void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) {
    assert(*ip <= *op);
    if (offset < 8) {
        /* close range match, overlap */
        static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 };   /* added */
        static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 };   /* subtracted */
        int const sub2 = dec64table[offset];
        (*op)[0] = (*ip)[0];
        (*op)[1] = (*ip)[1];
        (*op)[2] = (*ip)[2];
        (*op)[3] = (*ip)[3];
        *ip += dec32table[offset];
        ZSTD_copy4(*op+4, *ip);
        *ip -= sub2;
    } else {
        ZSTD_copy8(*op, *ip);
    }
    *ip += 8;
    *op += 8;
    assert(*op - *ip >= 8);
}
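/* Worked example (illustrative) : for offset==2 the 4+4 byte copies above
 * replicate the 2-byte pattern across the first 8 output bytes, and the
 * dec32table/dec64table adjustments leave *ip == *op - 8 on exit, so
 * subsequent 8-byte copies can proceed as if the offset were 8. */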

/*! ZSTD_safecopy() :
 *  Specialized version of memcpy() that is allowed to READ up to WILDCOPY_OVERLENGTH past the input buffer
 *  and write up to 16 bytes past oend_w (op >= oend_w is allowed).
 *  This function is only called in the uncommon case where the sequence is near the end of the block. It
 *  should be fast for a single long sequence, but can be slow for several short sequences.
 *
 *  @param ovtype controls the overlap detection
 *         - ZSTD_no_overlap: The source and destination are guaranteed to be at least WILDCOPY_VECLEN bytes apart.
 *         - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart.
 *           The src buffer must be before the dst buffer.
 */
static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) {
    ptrdiff_t const diff = op - ip;
    BYTE* const oend = op + length;

    assert((ovtype == ZSTD_no_overlap && (diff <= -8 || diff >= 8 || op >= oend_w)) ||
           (ovtype == ZSTD_overlap_src_before_dst && diff >= 0));

    if (length < 8) {
        /* Handle short lengths. */
        while (op < oend) *op++ = *ip++;
        return;
    }
    if (ovtype == ZSTD_overlap_src_before_dst) {
        /* Copy 8 bytes and ensure the offset >= 8 when there can be overlap. */
        assert(length >= 8);
        ZSTD_overlapCopy8(&op, &ip, diff);
        length -= 8;
        assert(op - ip >= 8);
        assert(op <= oend);
    }

    if (oend <= oend_w) {
        /* No risk of overwrite. */
        ZSTD_wildcopy(op, ip, length, ovtype);
        return;
    }
    if (op <= oend_w) {
        /* Wildcopy until we get close to the end. */
        assert(oend > oend_w);
        ZSTD_wildcopy(op, ip, oend_w - op, ovtype);
        ip += oend_w - op;
        op += oend_w - op;
    }
    /* Handle the leftovers. */
    while (op < oend) *op++ = *ip++;
}

/* ZSTD_safecopyDstBeforeSrc():
 * This version allows overlap with dst before src, or handles the non-overlap case with dst after src
 * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */
static void ZSTD_safecopyDstBeforeSrc(BYTE* op, const BYTE* ip, ptrdiff_t length) {
    ptrdiff_t const diff = op - ip;
    BYTE* const oend = op + length;

    if (length < 8 || diff > -8) {
        /* Handle short lengths, close overlaps, and dst not before src. */
        while (op < oend) *op++ = *ip++;
        return;
    }

    if (op <= oend - WILDCOPY_OVERLENGTH && diff < -WILDCOPY_VECLEN) {
        ZSTD_wildcopy(op, ip, oend - WILDCOPY_OVERLENGTH - op, ZSTD_no_overlap);
        ip += oend - WILDCOPY_OVERLENGTH - op;
        op += oend - WILDCOPY_OVERLENGTH - op;
    }

    /* Handle the leftovers. */
    while (op < oend) *op++ = *ip++;
}

/* ZSTD_execSequenceEnd():
 * This version handles cases that are near the end of the output buffer. It requires
 * more careful checks to make sure there is no overflow. By separating out these hard
 * and unlikely cases, we can speed up the common cases.
 *
 * NOTE: This function needs to be fast for a single long sequence, but doesn't need
 * to be optimized for many small sequences, since those fall into ZSTD_execSequence().
 */
FORCE_NOINLINE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_execSequenceEnd(BYTE* op,
    BYTE* const oend, seq_t sequence,
    const BYTE** litPtr, const BYTE* const litLimit,
    const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
{
    BYTE* const oLitEnd = op + sequence.litLength;
    size_t const sequenceLength = sequence.litLength + sequence.matchLength;
    const BYTE* const iLitEnd = *litPtr + sequence.litLength;
    const BYTE* match = oLitEnd - sequence.offset;
    BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;

    /* bounds checks : careful of address space overflow in 32-bit mode */
    RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer");
    RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer");
    assert(op < op + sequenceLength);
    assert(oLitEnd < op + sequenceLength);

    /* copy literals */
    ZSTD_safecopy(op, oend_w, *litPtr, sequence.litLength, ZSTD_no_overlap);
    op = oLitEnd;
    *litPtr = iLitEnd;

    /* copy Match */
    if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
        /* offset beyond prefix */
        RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, "");
        match = dictEnd - (prefixStart - match);
        if (match + sequence.matchLength <= dictEnd) {
            ZSTD_memmove(oLitEnd, match, sequence.matchLength);
            return sequenceLength;
        }
        /* span extDict & currentPrefixSegment */
        {   size_t const length1 = dictEnd - match;
            ZSTD_memmove(oLitEnd, match, length1);
            op = oLitEnd + length1;
            sequence.matchLength -= length1;
            match = prefixStart;
    }   }
    ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);
    return sequenceLength;
}
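/* Worked example (illustrative) : with "...abcd" already written to the
 * output, a sequence { litLength=1, offset=1, matchLength=4 } first appends
 * one literal, say 'e', then copies 4 bytes starting at op-1 (the 'e' just
 * written), yielding "...abcdeeeee".  Overlapping copies like this are what
 * ZSTD_overlapCopy8() and ZSTD_safecopy() are built to handle. */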
/* ZSTD_execSequenceEndSplitLitBuffer():
 * This version is intended to be used in cases where the litBuffer is still split.  It is kept separate to avoid performance impact for the good case.
 */
FORCE_NOINLINE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op,
    BYTE* const oend, const BYTE* const oend_w, seq_t sequence,
    const BYTE** litPtr, const BYTE* const litLimit,
    const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
{
    BYTE* const oLitEnd = op + sequence.litLength;
    size_t const sequenceLength = sequence.litLength + sequence.matchLength;
    const BYTE* const iLitEnd = *litPtr + sequence.litLength;
    const BYTE* match = oLitEnd - sequence.offset;


    /* bounds checks : careful of address space overflow in 32-bit mode */
    RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer");
    RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer");
    assert(op < op + sequenceLength);
    assert(oLitEnd < op + sequenceLength);

    /* copy literals */
    RETURN_ERROR_IF(op > *litPtr && op < *litPtr + sequence.litLength, dstSize_tooSmall, "output should not catch up to and overwrite literal buffer");
    ZSTD_safecopyDstBeforeSrc(op, *litPtr, sequence.litLength);
    op = oLitEnd;
    *litPtr = iLitEnd;

    /* copy Match */
    if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
        /* offset beyond prefix */
        RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, "");
        match = dictEnd - (prefixStart - match);
        if (match + sequence.matchLength <= dictEnd) {
            ZSTD_memmove(oLitEnd, match, sequence.matchLength);
            return sequenceLength;
        }
        /* span extDict & currentPrefixSegment */
        {   size_t const length1 = dictEnd - match;
            ZSTD_memmove(oLitEnd, match, length1);
            op = oLitEnd + length1;
            sequence.matchLength -= length1;
            match = prefixStart;
    }   }
    ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);
    return sequenceLength;
}

HINT_INLINE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_execSequence(BYTE* op,
    BYTE* const oend, seq_t sequence,
    const BYTE** litPtr, const BYTE* const litLimit,
    const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
{
    BYTE* const oLitEnd = op + sequence.litLength;
    size_t const sequenceLength = sequence.litLength + sequence.matchLength;
    BYTE* const oMatchEnd = op + sequenceLength;   /* risk : address space overflow (32-bits) */
overflow (32-bits) */ 1009*3117ece4Schristos BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH; /* risk : address space underflow on oend=NULL */ 1010*3117ece4Schristos const BYTE* const iLitEnd = *litPtr + sequence.litLength; 1011*3117ece4Schristos const BYTE* match = oLitEnd - sequence.offset; 1012*3117ece4Schristos 1013*3117ece4Schristos assert(op != NULL /* Precondition */); 1014*3117ece4Schristos assert(oend_w < oend /* No underflow */); 1015*3117ece4Schristos 1016*3117ece4Schristos #if defined(__aarch64__) 1017*3117ece4Schristos /* prefetch sequence starting from match that will be used for copy later */ 1018*3117ece4Schristos PREFETCH_L1(match); 1019*3117ece4Schristos #endif 1020*3117ece4Schristos /* Handle edge cases in a slow path: 1021*3117ece4Schristos * - Read beyond end of literals 1022*3117ece4Schristos * - Match end is within WILDCOPY_OVERLENGTH of oend 1023*3117ece4Schristos * - 32-bit mode and the match length overflows 1024*3117ece4Schristos */ 1025*3117ece4Schristos if (UNLIKELY( 1026*3117ece4Schristos iLitEnd > litLimit || 1027*3117ece4Schristos oMatchEnd > oend_w || 1028*3117ece4Schristos (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH))) 1029*3117ece4Schristos return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd); 1030*3117ece4Schristos 1031*3117ece4Schristos /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */ 1032*3117ece4Schristos assert(op <= oLitEnd /* No overflow */); 1033*3117ece4Schristos assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */); 1034*3117ece4Schristos assert(oMatchEnd <= oend /* No underflow */); 1035*3117ece4Schristos assert(iLitEnd <= litLimit /* Literal length is in bounds */); 1036*3117ece4Schristos assert(oLitEnd <= oend_w /* Can wildcopy literals */); 1037*3117ece4Schristos assert(oMatchEnd <= oend_w /* Can wildcopy matches */); 1038*3117ece4Schristos 1039*3117ece4Schristos /* Copy Literals: 1040*3117ece4Schristos * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9. 1041*3117ece4Schristos * We likely don't need the full 32-byte wildcopy.
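 * (Note, for illustration : the unconditional 16-byte ZSTD_copy16() below is
 * safe even when litLength < 16, because the assertion oLitEnd <= oend_w
 * above guarantees at least WILDCOPY_OVERLENGTH (>= 16) bytes of writable
 * headroom past op, and the literal buffer is allocated with a matching
 * overread margin.)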
1042*3117ece4Schristos */ 1043*3117ece4Schristos assert(WILDCOPY_OVERLENGTH >= 16); 1044*3117ece4Schristos ZSTD_copy16(op, (*litPtr)); 1045*3117ece4Schristos if (UNLIKELY(sequence.litLength > 16)) { 1046*3117ece4Schristos ZSTD_wildcopy(op + 16, (*litPtr) + 16, sequence.litLength - 16, ZSTD_no_overlap); 1047*3117ece4Schristos } 1048*3117ece4Schristos op = oLitEnd; 1049*3117ece4Schristos *litPtr = iLitEnd; /* update for next sequence */ 1050*3117ece4Schristos 1051*3117ece4Schristos /* Copy Match */ 1052*3117ece4Schristos if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { 1053*3117ece4Schristos /* offset beyond prefix -> go into extDict */ 1054*3117ece4Schristos RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, ""); 1055*3117ece4Schristos match = dictEnd + (match - prefixStart); 1056*3117ece4Schristos if (match + sequence.matchLength <= dictEnd) { 1057*3117ece4Schristos ZSTD_memmove(oLitEnd, match, sequence.matchLength); 1058*3117ece4Schristos return sequenceLength; 1059*3117ece4Schristos } 1060*3117ece4Schristos /* span extDict & currentPrefixSegment */ 1061*3117ece4Schristos { size_t const length1 = dictEnd - match; 1062*3117ece4Schristos ZSTD_memmove(oLitEnd, match, length1); 1063*3117ece4Schristos op = oLitEnd + length1; 1064*3117ece4Schristos sequence.matchLength -= length1; 1065*3117ece4Schristos match = prefixStart; 1066*3117ece4Schristos } 1067*3117ece4Schristos } 1068*3117ece4Schristos /* Match within prefix of 1 or more bytes */ 1069*3117ece4Schristos assert(op <= oMatchEnd); 1070*3117ece4Schristos assert(oMatchEnd <= oend_w); 1071*3117ece4Schristos assert(match >= prefixStart); 1072*3117ece4Schristos assert(sequence.matchLength >= 1); 1073*3117ece4Schristos 1074*3117ece4Schristos /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy 1075*3117ece4Schristos * without overlap checking. 1076*3117ece4Schristos */ 1077*3117ece4Schristos if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) { 1078*3117ece4Schristos /* We bet on a full wildcopy for matches, since we expect matches to be 1079*3117ece4Schristos * longer than literals (in general). In silesia, ~10% of matches are longer 1080*3117ece4Schristos * than 16 bytes. 1081*3117ece4Schristos */ 1082*3117ece4Schristos ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap); 1083*3117ece4Schristos return sequenceLength; 1084*3117ece4Schristos } 1085*3117ece4Schristos assert(sequence.offset < WILDCOPY_VECLEN); 1086*3117ece4Schristos 1087*3117ece4Schristos /* Copy 8 bytes and spread the offset to be >= 8. */ 1088*3117ece4Schristos ZSTD_overlapCopy8(&op, &match, sequence.offset); 1089*3117ece4Schristos 1090*3117ece4Schristos /* If the match length is > 8 bytes, then continue with the wildcopy. 
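 * Illustrative example (not a separate code path) : with offset == 1 and
 * matchLength == 12, ZSTD_overlapCopy8() above replicates the preceding
 * byte across 8 positions and widens the effective offset to >= 8, so the
 * remaining 4 bytes can then be copied 8 bytes at a time here.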
*/ 1091*3117ece4Schristos if (sequence.matchLength > 8) { 1092*3117ece4Schristos assert(op < oMatchEnd); 1093*3117ece4Schristos ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8, ZSTD_overlap_src_before_dst); 1094*3117ece4Schristos } 1095*3117ece4Schristos return sequenceLength; 1096*3117ece4Schristos } 1097*3117ece4Schristos 1098*3117ece4Schristos HINT_INLINE 1099*3117ece4Schristos ZSTD_ALLOW_POINTER_OVERFLOW_ATTR 1100*3117ece4Schristos size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op, 1101*3117ece4Schristos BYTE* const oend, const BYTE* const oend_w, seq_t sequence, 1102*3117ece4Schristos const BYTE** litPtr, const BYTE* const litLimit, 1103*3117ece4Schristos const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) 1104*3117ece4Schristos { 1105*3117ece4Schristos BYTE* const oLitEnd = op + sequence.litLength; 1106*3117ece4Schristos size_t const sequenceLength = sequence.litLength + sequence.matchLength; 1107*3117ece4Schristos BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */ 1108*3117ece4Schristos const BYTE* const iLitEnd = *litPtr + sequence.litLength; 1109*3117ece4Schristos const BYTE* match = oLitEnd - sequence.offset; 1110*3117ece4Schristos 1111*3117ece4Schristos assert(op != NULL /* Precondition */); 1112*3117ece4Schristos assert(oend_w < oend /* No underflow */); 1113*3117ece4Schristos /* Handle edge cases in a slow path: 1114*3117ece4Schristos * - Read beyond end of literals 1115*3117ece4Schristos * - Match end is within WILDCOPY_OVERLENGTH of oend 1116*3117ece4Schristos * - 32-bit mode and the match length overflows 1117*3117ece4Schristos */ 1118*3117ece4Schristos if (UNLIKELY( 1119*3117ece4Schristos iLitEnd > litLimit || 1120*3117ece4Schristos oMatchEnd > oend_w || 1121*3117ece4Schristos (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH))) 1122*3117ece4Schristos return ZSTD_execSequenceEndSplitLitBuffer(op, oend, oend_w, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd); 1123*3117ece4Schristos 1124*3117ece4Schristos /* Assumptions (everything else goes into ZSTD_execSequenceEndSplitLitBuffer()) */ 1125*3117ece4Schristos assert(op <= oLitEnd /* No overflow */); 1126*3117ece4Schristos assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */); 1127*3117ece4Schristos assert(oMatchEnd <= oend /* No underflow */); 1128*3117ece4Schristos assert(iLitEnd <= litLimit /* Literal length is in bounds */); 1129*3117ece4Schristos assert(oLitEnd <= oend_w /* Can wildcopy literals */); 1130*3117ece4Schristos assert(oMatchEnd <= oend_w /* Can wildcopy matches */); 1131*3117ece4Schristos 1132*3117ece4Schristos /* Copy Literals: 1133*3117ece4Schristos * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9. 1134*3117ece4Schristos * We likely don't need the full 32-byte wildcopy.
1135*3117ece4Schristos */ 1136*3117ece4Schristos assert(WILDCOPY_OVERLENGTH >= 16); 1137*3117ece4Schristos ZSTD_copy16(op, (*litPtr)); 1138*3117ece4Schristos if (UNLIKELY(sequence.litLength > 16)) { 1139*3117ece4Schristos ZSTD_wildcopy(op+16, (*litPtr)+16, sequence.litLength-16, ZSTD_no_overlap); 1140*3117ece4Schristos } 1141*3117ece4Schristos op = oLitEnd; 1142*3117ece4Schristos *litPtr = iLitEnd; /* update for next sequence */ 1143*3117ece4Schristos 1144*3117ece4Schristos /* Copy Match */ 1145*3117ece4Schristos if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { 1146*3117ece4Schristos /* offset beyond prefix -> go into extDict */ 1147*3117ece4Schristos RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, ""); 1148*3117ece4Schristos match = dictEnd + (match - prefixStart); 1149*3117ece4Schristos if (match + sequence.matchLength <= dictEnd) { 1150*3117ece4Schristos ZSTD_memmove(oLitEnd, match, sequence.matchLength); 1151*3117ece4Schristos return sequenceLength; 1152*3117ece4Schristos } 1153*3117ece4Schristos /* span extDict & currentPrefixSegment */ 1154*3117ece4Schristos { size_t const length1 = dictEnd - match; 1155*3117ece4Schristos ZSTD_memmove(oLitEnd, match, length1); 1156*3117ece4Schristos op = oLitEnd + length1; 1157*3117ece4Schristos sequence.matchLength -= length1; 1158*3117ece4Schristos match = prefixStart; 1159*3117ece4Schristos } } 1160*3117ece4Schristos /* Match within prefix of 1 or more bytes */ 1161*3117ece4Schristos assert(op <= oMatchEnd); 1162*3117ece4Schristos assert(oMatchEnd <= oend_w); 1163*3117ece4Schristos assert(match >= prefixStart); 1164*3117ece4Schristos assert(sequence.matchLength >= 1); 1165*3117ece4Schristos 1166*3117ece4Schristos /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy 1167*3117ece4Schristos * without overlap checking. 1168*3117ece4Schristos */ 1169*3117ece4Schristos if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) { 1170*3117ece4Schristos /* We bet on a full wildcopy for matches, since we expect matches to be 1171*3117ece4Schristos * longer than literals (in general). In silesia, ~10% of matches are longer 1172*3117ece4Schristos * than 16 bytes. 1173*3117ece4Schristos */ 1174*3117ece4Schristos ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap); 1175*3117ece4Schristos return sequenceLength; 1176*3117ece4Schristos } 1177*3117ece4Schristos assert(sequence.offset < WILDCOPY_VECLEN); 1178*3117ece4Schristos 1179*3117ece4Schristos /* Copy 8 bytes and spread the offset to be >= 8. */ 1180*3117ece4Schristos ZSTD_overlapCopy8(&op, &match, sequence.offset); 1181*3117ece4Schristos 1182*3117ece4Schristos /* If the match length is > 8 bytes, then continue with the wildcopy. 
*/ 1183*3117ece4Schristos if (sequence.matchLength > 8) { 1184*3117ece4Schristos assert(op < oMatchEnd); 1185*3117ece4Schristos ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8, ZSTD_overlap_src_before_dst); 1186*3117ece4Schristos } 1187*3117ece4Schristos return sequenceLength; 1188*3117ece4Schristos } 1189*3117ece4Schristos 1190*3117ece4Schristos 1191*3117ece4Schristos static void 1192*3117ece4Schristos ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqSymbol* dt) 1193*3117ece4Schristos { 1194*3117ece4Schristos const void* ptr = dt; 1195*3117ece4Schristos const ZSTD_seqSymbol_header* const DTableH = (const ZSTD_seqSymbol_header*)ptr; 1196*3117ece4Schristos DStatePtr->state = BIT_readBits(bitD, DTableH->tableLog); 1197*3117ece4Schristos DEBUGLOG(6, "ZSTD_initFseState : val=%u using %u bits", 1198*3117ece4Schristos (U32)DStatePtr->state, DTableH->tableLog); 1199*3117ece4Schristos BIT_reloadDStream(bitD); 1200*3117ece4Schristos DStatePtr->table = dt + 1; 1201*3117ece4Schristos } 1202*3117ece4Schristos 1203*3117ece4Schristos FORCE_INLINE_TEMPLATE void 1204*3117ece4Schristos ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 nextState, U32 nbBits) 1205*3117ece4Schristos { 1206*3117ece4Schristos size_t const lowBits = BIT_readBits(bitD, nbBits); 1207*3117ece4Schristos DStatePtr->state = nextState + lowBits; 1208*3117ece4Schristos } 1209*3117ece4Schristos 1210*3117ece4Schristos /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum 1211*3117ece4Schristos * offset bits. But we can only read at most STREAM_ACCUMULATOR_MIN_32 1212*3117ece4Schristos * bits before reloading. This value is the maximum number of bits we read 1213*3117ece4Schristos * after reloading when we are decoding long offsets. 1214*3117ece4Schristos */ 1215*3117ece4Schristos #define LONG_OFFSETS_MAX_EXTRA_BITS_32 \ 1216*3117ece4Schristos (ZSTD_WINDOWLOG_MAX_32 > STREAM_ACCUMULATOR_MIN_32 \ 1217*3117ece4Schristos ? ZSTD_WINDOWLOG_MAX_32 - STREAM_ACCUMULATOR_MIN_32 \ 1218*3117ece4Schristos : 0) 1219*3117ece4Schristos 1220*3117ece4Schristos typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e; 1221*3117ece4Schristos 1222*3117ece4Schristos /** 1223*3117ece4Schristos * ZSTD_decodeSequence(): 1224*3117ece4Schristos * @p longOffsets : tells the decoder to reload more bits while decoding large offsets 1225*3117ece4Schristos * only used in 32-bit mode 1226*3117ece4Schristos * @return : Sequence (litL + matchL + offset) 1227*3117ece4Schristos */ 1228*3117ece4Schristos FORCE_INLINE_TEMPLATE seq_t 1229*3117ece4Schristos ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const int isLastSeq) 1230*3117ece4Schristos { 1231*3117ece4Schristos seq_t seq; 1232*3117ece4Schristos /* 1233*3117ece4Schristos * ZSTD_seqSymbol is a 64-bit wide structure. 1234*3117ece4Schristos * It can be loaded in one operation 1235*3117ece4Schristos * and its fields extracted by simply shifting or bit-extracting on aarch64. 1236*3117ece4Schristos * GCC doesn't recognize this and generates unnecessary ldr/ldrb/ldrh 1237*3117ece4Schristos * operations that cause a performance drop. This can be avoided by using this 1238*3117ece4Schristos * ZSTD_memcpy hack.
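 * Copying the 8-byte table entry into a local ZSTD_seqSymbol lets the
 * compiler keep the whole entry in a single register and extract each field
 * with one bitfield instruction (e.g. ubfx), instead of issuing a separate
 * narrow load per field.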
1239*3117ece4Schristos */ 1240*3117ece4Schristos #if defined(__aarch64__) && (defined(__GNUC__) && !defined(__clang__)) 1241*3117ece4Schristos ZSTD_seqSymbol llDInfoS, mlDInfoS, ofDInfoS; 1242*3117ece4Schristos ZSTD_seqSymbol* const llDInfo = &llDInfoS; 1243*3117ece4Schristos ZSTD_seqSymbol* const mlDInfo = &mlDInfoS; 1244*3117ece4Schristos ZSTD_seqSymbol* const ofDInfo = &ofDInfoS; 1245*3117ece4Schristos ZSTD_memcpy(llDInfo, seqState->stateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol)); 1246*3117ece4Schristos ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol)); 1247*3117ece4Schristos ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol)); 1248*3117ece4Schristos #else 1249*3117ece4Schristos const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state; 1250*3117ece4Schristos const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state; 1251*3117ece4Schristos const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state; 1252*3117ece4Schristos #endif 1253*3117ece4Schristos seq.matchLength = mlDInfo->baseValue; 1254*3117ece4Schristos seq.litLength = llDInfo->baseValue; 1255*3117ece4Schristos { U32 const ofBase = ofDInfo->baseValue; 1256*3117ece4Schristos BYTE const llBits = llDInfo->nbAdditionalBits; 1257*3117ece4Schristos BYTE const mlBits = mlDInfo->nbAdditionalBits; 1258*3117ece4Schristos BYTE const ofBits = ofDInfo->nbAdditionalBits; 1259*3117ece4Schristos BYTE const totalBits = llBits+mlBits+ofBits; 1260*3117ece4Schristos 1261*3117ece4Schristos U16 const llNext = llDInfo->nextState; 1262*3117ece4Schristos U16 const mlNext = mlDInfo->nextState; 1263*3117ece4Schristos U16 const ofNext = ofDInfo->nextState; 1264*3117ece4Schristos U32 const llnbBits = llDInfo->nbBits; 1265*3117ece4Schristos U32 const mlnbBits = mlDInfo->nbBits; 1266*3117ece4Schristos U32 const ofnbBits = ofDInfo->nbBits; 1267*3117ece4Schristos 1268*3117ece4Schristos assert(llBits <= MaxLLBits); 1269*3117ece4Schristos assert(mlBits <= MaxMLBits); 1270*3117ece4Schristos assert(ofBits <= MaxOff); 1271*3117ece4Schristos /* 1272*3117ece4Schristos * As gcc has better branch and block analyzers, it is sometimes only 1273*3117ece4Schristos * valuable to mark likeliness for clang, where it gives around 3-4% 1274*3117ece4Schristos * performance. 1275*3117ece4Schristos */ 1276*3117ece4Schristos 1277*3117ece4Schristos /* sequence */ 1278*3117ece4Schristos { size_t offset; 1279*3117ece4Schristos if (ofBits > 1) { 1280*3117ece4Schristos ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); 1281*3117ece4Schristos ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); 1282*3117ece4Schristos ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32); 1283*3117ece4Schristos ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits); 1284*3117ece4Schristos if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { 1285*3117ece4Schristos /* Always read extra bits, this keeps the logic simple, 1286*3117ece4Schristos * avoids branches, and avoids accidentally reading 0 bits.
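 * Worked example : the static asserts above pin LONG_OFFSETS_MAX_EXTRA_BITS_32
 * at 5, so for ofBits == 28 the first read takes 23 bits (shifted left by 5),
 * the stream is reloaded, and the remaining 5 bits complete the offset.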
1287*3117ece4Schristos */ 1288*3117ece4Schristos U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32; 1289*3117ece4Schristos offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); 1290*3117ece4Schristos BIT_reloadDStream(&seqState->DStream); 1291*3117ece4Schristos offset += BIT_readBitsFast(&seqState->DStream, extraBits); 1292*3117ece4Schristos } else { 1293*3117ece4Schristos offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ 1294*3117ece4Schristos if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); 1295*3117ece4Schristos } 1296*3117ece4Schristos seqState->prevOffset[2] = seqState->prevOffset[1]; 1297*3117ece4Schristos seqState->prevOffset[1] = seqState->prevOffset[0]; 1298*3117ece4Schristos seqState->prevOffset[0] = offset; 1299*3117ece4Schristos } else { 1300*3117ece4Schristos U32 const ll0 = (llDInfo->baseValue == 0); 1301*3117ece4Schristos if (LIKELY((ofBits == 0))) { 1302*3117ece4Schristos offset = seqState->prevOffset[ll0]; 1303*3117ece4Schristos seqState->prevOffset[1] = seqState->prevOffset[!ll0]; 1304*3117ece4Schristos seqState->prevOffset[0] = offset; 1305*3117ece4Schristos } else { 1306*3117ece4Schristos offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1); 1307*3117ece4Schristos { size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; 1308*3117ece4Schristos temp -= !temp; /* 0 is not valid: input corrupted => force offset to -1 => corruption detected at execSequence */ 1309*3117ece4Schristos if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; 1310*3117ece4Schristos seqState->prevOffset[1] = seqState->prevOffset[0]; 1311*3117ece4Schristos seqState->prevOffset[0] = offset = temp; 1312*3117ece4Schristos } } } 1313*3117ece4Schristos seq.offset = offset; 1314*3117ece4Schristos } 1315*3117ece4Schristos 1316*3117ece4Schristos if (mlBits > 0) 1317*3117ece4Schristos seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); 1318*3117ece4Schristos 1319*3117ece4Schristos if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) 1320*3117ece4Schristos BIT_reloadDStream(&seqState->DStream); 1321*3117ece4Schristos if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog))) 1322*3117ece4Schristos BIT_reloadDStream(&seqState->DStream); 1323*3117ece4Schristos /* Ensure there are enough bits to read the rest of data in 64-bit mode. 
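 * With the current table logs (LLFSELog == 9, MLFSELog == 9, OffFSELog == 8),
 * the static assert below bounds the worst case at 16+9+9+8 == 42 bits,
 * comfortably below the 57 bits guaranteed by STREAM_ACCUMULATOR_MIN_64.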
*/ 1324*3117ece4Schristos ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); 1325*3117ece4Schristos 1326*3117ece4Schristos if (llBits > 0) 1327*3117ece4Schristos seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); 1328*3117ece4Schristos 1329*3117ece4Schristos if (MEM_32bits()) 1330*3117ece4Schristos BIT_reloadDStream(&seqState->DStream); 1331*3117ece4Schristos 1332*3117ece4Schristos DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u", 1333*3117ece4Schristos (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); 1334*3117ece4Schristos 1335*3117ece4Schristos if (!isLastSeq) { 1336*3117ece4Schristos /* don't update FSE state for last Sequence */ 1337*3117ece4Schristos ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */ 1338*3117ece4Schristos ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */ 1339*3117ece4Schristos if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ 1340*3117ece4Schristos ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */ 1341*3117ece4Schristos BIT_reloadDStream(&seqState->DStream); 1342*3117ece4Schristos } 1343*3117ece4Schristos } 1344*3117ece4Schristos 1345*3117ece4Schristos return seq; 1346*3117ece4Schristos } 1347*3117ece4Schristos 1348*3117ece4Schristos #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1349*3117ece4Schristos #if DEBUGLEVEL >= 1 1350*3117ece4Schristos static int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd) 1351*3117ece4Schristos { 1352*3117ece4Schristos size_t const windowSize = dctx->fParams.windowSize; 1353*3117ece4Schristos /* No dictionary used. */ 1354*3117ece4Schristos if (dctx->dictContentEndForFuzzing == NULL) return 0; 1355*3117ece4Schristos /* Dictionary is our prefix. */ 1356*3117ece4Schristos if (prefixStart == dctx->dictContentBeginForFuzzing) return 1; 1357*3117ece4Schristos /* Dictionary is not our ext-dict. */ 1358*3117ece4Schristos if (dctx->dictEnd != dctx->dictContentEndForFuzzing) return 0; 1359*3117ece4Schristos /* Dictionary is not within our window size. */ 1360*3117ece4Schristos if ((size_t)(oLitEnd - prefixStart) >= windowSize) return 0; 1361*3117ece4Schristos /* Dictionary is active. 
*/ 1362*3117ece4Schristos return 1; 1363*3117ece4Schristos } 1364*3117ece4Schristos #endif 1365*3117ece4Schristos 1366*3117ece4Schristos static void ZSTD_assertValidSequence( 1367*3117ece4Schristos ZSTD_DCtx const* dctx, 1368*3117ece4Schristos BYTE const* op, BYTE const* oend, 1369*3117ece4Schristos seq_t const seq, 1370*3117ece4Schristos BYTE const* prefixStart, BYTE const* virtualStart) 1371*3117ece4Schristos { 1372*3117ece4Schristos #if DEBUGLEVEL >= 1 1373*3117ece4Schristos if (dctx->isFrameDecompression) { 1374*3117ece4Schristos size_t const windowSize = dctx->fParams.windowSize; 1375*3117ece4Schristos size_t const sequenceSize = seq.litLength + seq.matchLength; 1376*3117ece4Schristos BYTE const* const oLitEnd = op + seq.litLength; 1377*3117ece4Schristos DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u", 1378*3117ece4Schristos (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); 1379*3117ece4Schristos assert(op <= oend); 1380*3117ece4Schristos assert((size_t)(oend - op) >= sequenceSize); 1381*3117ece4Schristos assert(sequenceSize <= ZSTD_blockSizeMax(dctx)); 1382*3117ece4Schristos if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) { 1383*3117ece4Schristos size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing); 1384*3117ece4Schristos /* Offset must be within the dictionary. */ 1385*3117ece4Schristos assert(seq.offset <= (size_t)(oLitEnd - virtualStart)); 1386*3117ece4Schristos assert(seq.offset <= windowSize + dictSize); 1387*3117ece4Schristos } else { 1388*3117ece4Schristos /* Offset must be within our window. */ 1389*3117ece4Schristos assert(seq.offset <= windowSize); 1390*3117ece4Schristos } 1391*3117ece4Schristos } 1392*3117ece4Schristos #else 1393*3117ece4Schristos (void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart; 1394*3117ece4Schristos #endif 1395*3117ece4Schristos } 1396*3117ece4Schristos #endif 1397*3117ece4Schristos 1398*3117ece4Schristos #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG 1399*3117ece4Schristos 1400*3117ece4Schristos 1401*3117ece4Schristos FORCE_INLINE_TEMPLATE size_t 1402*3117ece4Schristos DONT_VECTORIZE 1403*3117ece4Schristos ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, 1404*3117ece4Schristos void* dst, size_t maxDstSize, 1405*3117ece4Schristos const void* seqStart, size_t seqSize, int nbSeq, 1406*3117ece4Schristos const ZSTD_longOffset_e isLongOffset) 1407*3117ece4Schristos { 1408*3117ece4Schristos const BYTE* ip = (const BYTE*)seqStart; 1409*3117ece4Schristos const BYTE* const iend = ip + seqSize; 1410*3117ece4Schristos BYTE* const ostart = (BYTE*)dst; 1411*3117ece4Schristos BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, maxDstSize); 1412*3117ece4Schristos BYTE* op = ostart; 1413*3117ece4Schristos const BYTE* litPtr = dctx->litPtr; 1414*3117ece4Schristos const BYTE* litBufferEnd = dctx->litBufferEnd; 1415*3117ece4Schristos const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); 1416*3117ece4Schristos const BYTE* const vBase = (const BYTE*) (dctx->virtualStart); 1417*3117ece4Schristos const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); 1418*3117ece4Schristos DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer (%i seqs)", nbSeq); 1419*3117ece4Schristos 1420*3117ece4Schristos /* Literals are split between internal buffer & output buffer */ 1421*3117ece4Schristos if (nbSeq) { 1422*3117ece4Schristos seqState_t seqState; 1423*3117ece4Schristos dctx->fseEntropy = 1; 1424*3117ece4Schristos { U32 i; for 
(i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; } 1425*3117ece4Schristos RETURN_ERROR_IF( 1426*3117ece4Schristos ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)), 1427*3117ece4Schristos corruption_detected, ""); 1428*3117ece4Schristos ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); 1429*3117ece4Schristos ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); 1430*3117ece4Schristos ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); 1431*3117ece4Schristos assert(dst != NULL); 1432*3117ece4Schristos 1433*3117ece4Schristos ZSTD_STATIC_ASSERT( 1434*3117ece4Schristos BIT_DStream_unfinished < BIT_DStream_completed && 1435*3117ece4Schristos BIT_DStream_endOfBuffer < BIT_DStream_completed && 1436*3117ece4Schristos BIT_DStream_completed < BIT_DStream_overflow); 1437*3117ece4Schristos 1438*3117ece4Schristos /* decompress without overrunning litPtr begins */ 1439*3117ece4Schristos { seq_t sequence = {0,0,0}; /* some static analyzers believe that @sequence is not initialized (it necessarily is, since the for(;;) loop has at least one iteration) */ 1440*3117ece4Schristos /* Align the decompression loop to 32 + 16 bytes. 1441*3117ece4Schristos * 1442*3117ece4Schristos * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression 1443*3117ece4Schristos * speed swings based on the alignment of the decompression loop. This 1444*3117ece4Schristos * performance swing is caused by parts of the decompression loop falling 1445*3117ece4Schristos * out of the DSB. The entire decompression loop should fit in the DSB; 1446*3117ece4Schristos * when it can't, we get much worse performance. You can measure if you've 1447*3117ece4Schristos * hit the good case or the bad case with this perf command for some 1448*3117ece4Schristos * compressed file test.zst: 1449*3117ece4Schristos * 1450*3117ece4Schristos * perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \ 1451*3117ece4Schristos * -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst 1452*3117ece4Schristos * 1453*3117ece4Schristos * If you see most cycles served out of the MITE you've hit the bad case. 1454*3117ece4Schristos * If you see most cycles served out of the DSB you've hit the good case. 1455*3117ece4Schristos * If it is pretty even then you may be in an okay case. 1456*3117ece4Schristos * 1457*3117ece4Schristos * This issue has been reproduced on the following CPUs: 1458*3117ece4Schristos * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9 1459*3117ece4Schristos * Use Instruments->Counters to get DSB/MITE cycles. 1460*3117ece4Schristos * I never got performance swings, but I was able to 1461*3117ece4Schristos * go from the good case of mostly DSB to half of the 1462*3117ece4Schristos * cycles served from MITE.
1463*3117ece4Schristos * - Coffeelake: Intel i9-9900k 1464*3117ece4Schristos * - Coffeelake: Intel i7-9700k 1465*3117ece4Schristos * 1466*3117ece4Schristos * I haven't been able to reproduce the instability or DSB misses on any 1467*3117ece4Schristos * of the following CPUs: 1468*3117ece4Schristos * - Haswell 1469*3117ece4Schristos * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz 1470*3117ece4Schristos * - Skylake 1471*3117ece4Schristos * 1472*3117ece4Schristos * Alignment is done for each of the three major decompression loops: 1473*3117ece4Schristos * - ZSTD_decompressSequences_bodySplitLitBuffer - presplit section of the literal buffer 1474*3117ece4Schristos * - ZSTD_decompressSequences_bodySplitLitBuffer - postsplit section of the literal buffer 1475*3117ece4Schristos * - ZSTD_decompressSequences_body 1476*3117ece4Schristos * Alignment choices are made to minimize large swings on bad cases and influence on performance 1477*3117ece4Schristos * from changes external to this code, rather than to overoptimize on the current commit. 1478*3117ece4Schristos * 1479*3117ece4Schristos * If you are seeing performance instability, this script can help test. 1480*3117ece4Schristos * It tests on 4 commits in zstd where I saw performance change. 1481*3117ece4Schristos * 1482*3117ece4Schristos * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4 1483*3117ece4Schristos */ 1484*3117ece4Schristos #if defined(__GNUC__) && defined(__x86_64__) 1485*3117ece4Schristos __asm__(".p2align 6"); 1486*3117ece4Schristos # if __GNUC__ >= 7 1487*3117ece4Schristos /* good for gcc-7, gcc-9, and gcc-11 */ 1488*3117ece4Schristos __asm__("nop"); 1489*3117ece4Schristos __asm__(".p2align 5"); 1490*3117ece4Schristos __asm__("nop"); 1491*3117ece4Schristos __asm__(".p2align 4"); 1492*3117ece4Schristos # if __GNUC__ == 8 || __GNUC__ == 10 1493*3117ece4Schristos /* good for gcc-8 and gcc-10 */ 1494*3117ece4Schristos __asm__("nop"); 1495*3117ece4Schristos __asm__(".p2align 3"); 1496*3117ece4Schristos # endif 1497*3117ece4Schristos # endif 1498*3117ece4Schristos #endif 1499*3117ece4Schristos 1500*3117ece4Schristos /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */ 1501*3117ece4Schristos for ( ; nbSeq; nbSeq--) { 1502*3117ece4Schristos sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); 1503*3117ece4Schristos if (litPtr + sequence.litLength > dctx->litBufferEnd) break; 1504*3117ece4Schristos { size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); 1505*3117ece4Schristos #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1506*3117ece4Schristos assert(!ZSTD_isError(oneSeqSize)); 1507*3117ece4Schristos ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); 1508*3117ece4Schristos #endif 1509*3117ece4Schristos if (UNLIKELY(ZSTD_isError(oneSeqSize))) 1510*3117ece4Schristos return oneSeqSize; 1511*3117ece4Schristos DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); 1512*3117ece4Schristos op += oneSeqSize; 1513*3117ece4Schristos } } 1514*3117ece4Schristos DEBUGLOG(6, "reached: (litPtr + sequence.litLength > dctx->litBufferEnd)"); 1515*3117ece4Schristos 1516*3117ece4Schristos /* If there are more sequences, they will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */ 1517*3117ece4Schristos if (nbSeq > 0) {
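/* The sequence that triggered the break above was decoded but not yet
 * executed : flush the literals still resident in the dst-side buffer,
 * switch litPtr over to litExtraBuffer, then execute that pending
 * sequence from there. */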
1518*3117ece4Schristos const size_t leftoverLit = dctx->litBufferEnd - litPtr; 1519*3117ece4Schristos DEBUGLOG(6, "There are %i sequences left, and %zu/%zu literals left in buffer", nbSeq, leftoverLit, sequence.litLength); 1520*3117ece4Schristos if (leftoverLit) { 1521*3117ece4Schristos RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); 1522*3117ece4Schristos ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); 1523*3117ece4Schristos sequence.litLength -= leftoverLit; 1524*3117ece4Schristos op += leftoverLit; 1525*3117ece4Schristos } 1526*3117ece4Schristos litPtr = dctx->litExtraBuffer; 1527*3117ece4Schristos litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; 1528*3117ece4Schristos dctx->litBufferLocation = ZSTD_not_in_dst; 1529*3117ece4Schristos { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); 1530*3117ece4Schristos #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1531*3117ece4Schristos assert(!ZSTD_isError(oneSeqSize)); 1532*3117ece4Schristos ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); 1533*3117ece4Schristos #endif 1534*3117ece4Schristos if (UNLIKELY(ZSTD_isError(oneSeqSize))) 1535*3117ece4Schristos return oneSeqSize; 1536*3117ece4Schristos DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); 1537*3117ece4Schristos op += oneSeqSize; 1538*3117ece4Schristos } 1539*3117ece4Schristos nbSeq--; 1540*3117ece4Schristos } 1541*3117ece4Schristos } 1542*3117ece4Schristos 1543*3117ece4Schristos if (nbSeq > 0) { 1544*3117ece4Schristos /* there is remaining lit from extra buffer */ 1545*3117ece4Schristos 1546*3117ece4Schristos #if defined(__GNUC__) && defined(__x86_64__) 1547*3117ece4Schristos __asm__(".p2align 6"); 1548*3117ece4Schristos __asm__("nop"); 1549*3117ece4Schristos # if __GNUC__ != 7 1550*3117ece4Schristos /* worse for gcc-7 better for gcc-8, gcc-9, and gcc-10 and clang */ 1551*3117ece4Schristos __asm__(".p2align 4"); 1552*3117ece4Schristos __asm__("nop"); 1553*3117ece4Schristos __asm__(".p2align 3"); 1554*3117ece4Schristos # elif __GNUC__ >= 11 1555*3117ece4Schristos __asm__(".p2align 3"); 1556*3117ece4Schristos # else 1557*3117ece4Schristos __asm__(".p2align 5"); 1558*3117ece4Schristos __asm__("nop"); 1559*3117ece4Schristos __asm__(".p2align 3"); 1560*3117ece4Schristos # endif 1561*3117ece4Schristos #endif 1562*3117ece4Schristos 1563*3117ece4Schristos for ( ; nbSeq ; nbSeq--) { 1564*3117ece4Schristos seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); 1565*3117ece4Schristos size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); 1566*3117ece4Schristos #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1567*3117ece4Schristos assert(!ZSTD_isError(oneSeqSize)); 1568*3117ece4Schristos ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); 1569*3117ece4Schristos #endif 1570*3117ece4Schristos if (UNLIKELY(ZSTD_isError(oneSeqSize))) 1571*3117ece4Schristos return oneSeqSize; 1572*3117ece4Schristos DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); 1573*3117ece4Schristos op += oneSeqSize; 1574*3117ece4Schristos } 1575*3117ece4Schristos } 1576*3117ece4Schristos 1577*3117ece4Schristos /* check if reached exact end */ 1578*3117ece4Schristos DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode 
loop, remaining nbSeq : %i", nbSeq); 1579*3117ece4Schristos RETURN_ERROR_IF(nbSeq, corruption_detected, ""); 1580*3117ece4Schristos DEBUGLOG(5, "bitStream : start=%p, ptr=%p, bitsConsumed=%u", seqState.DStream.start, seqState.DStream.ptr, seqState.DStream.bitsConsumed); 1581*3117ece4Schristos RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); 1582*3117ece4Schristos /* save reps for next block */ 1583*3117ece4Schristos { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); } 1584*3117ece4Schristos } 1585*3117ece4Schristos 1586*3117ece4Schristos /* last literal segment */ 1587*3117ece4Schristos if (dctx->litBufferLocation == ZSTD_split) { 1588*3117ece4Schristos /* split hasn't been reached yet, first get dst then copy litExtraBuffer */ 1589*3117ece4Schristos size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); 1590*3117ece4Schristos DEBUGLOG(6, "copy last literals from segment : %u", (U32)lastLLSize); 1591*3117ece4Schristos RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); 1592*3117ece4Schristos if (op != NULL) { 1593*3117ece4Schristos ZSTD_memmove(op, litPtr, lastLLSize); 1594*3117ece4Schristos op += lastLLSize; 1595*3117ece4Schristos } 1596*3117ece4Schristos litPtr = dctx->litExtraBuffer; 1597*3117ece4Schristos litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; 1598*3117ece4Schristos dctx->litBufferLocation = ZSTD_not_in_dst; 1599*3117ece4Schristos } 1600*3117ece4Schristos /* copy last literals from internal buffer */ 1601*3117ece4Schristos { size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); 1602*3117ece4Schristos DEBUGLOG(6, "copy last literals from internal buffer : %u", (U32)lastLLSize); 1603*3117ece4Schristos RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); 1604*3117ece4Schristos if (op != NULL) { 1605*3117ece4Schristos ZSTD_memcpy(op, litPtr, lastLLSize); 1606*3117ece4Schristos op += lastLLSize; 1607*3117ece4Schristos } } 1608*3117ece4Schristos 1609*3117ece4Schristos DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart)); 1610*3117ece4Schristos return (size_t)(op - ostart); 1611*3117ece4Schristos } 1612*3117ece4Schristos 1613*3117ece4Schristos FORCE_INLINE_TEMPLATE size_t 1614*3117ece4Schristos DONT_VECTORIZE 1615*3117ece4Schristos ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, 1616*3117ece4Schristos void* dst, size_t maxDstSize, 1617*3117ece4Schristos const void* seqStart, size_t seqSize, int nbSeq, 1618*3117ece4Schristos const ZSTD_longOffset_e isLongOffset) 1619*3117ece4Schristos { 1620*3117ece4Schristos const BYTE* ip = (const BYTE*)seqStart; 1621*3117ece4Schristos const BYTE* const iend = ip + seqSize; 1622*3117ece4Schristos BYTE* const ostart = (BYTE*)dst; 1623*3117ece4Schristos BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? 
ZSTD_maybeNullPtrAdd(ostart, maxDstSize) : dctx->litBuffer; 1624*3117ece4Schristos BYTE* op = ostart; 1625*3117ece4Schristos const BYTE* litPtr = dctx->litPtr; 1626*3117ece4Schristos const BYTE* const litEnd = litPtr + dctx->litSize; 1627*3117ece4Schristos const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart); 1628*3117ece4Schristos const BYTE* const vBase = (const BYTE*)(dctx->virtualStart); 1629*3117ece4Schristos const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd); 1630*3117ece4Schristos DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq); 1631*3117ece4Schristos 1632*3117ece4Schristos /* Regen sequences */ 1633*3117ece4Schristos if (nbSeq) { 1634*3117ece4Schristos seqState_t seqState; 1635*3117ece4Schristos dctx->fseEntropy = 1; 1636*3117ece4Schristos { U32 i; for (i = 0; i < ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; } 1637*3117ece4Schristos RETURN_ERROR_IF( 1638*3117ece4Schristos ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend - ip)), 1639*3117ece4Schristos corruption_detected, ""); 1640*3117ece4Schristos ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); 1641*3117ece4Schristos ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); 1642*3117ece4Schristos ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); 1643*3117ece4Schristos assert(dst != NULL); 1644*3117ece4Schristos 1645*3117ece4Schristos #if defined(__GNUC__) && defined(__x86_64__) 1646*3117ece4Schristos __asm__(".p2align 6"); 1647*3117ece4Schristos __asm__("nop"); 1648*3117ece4Schristos # if __GNUC__ >= 7 1649*3117ece4Schristos __asm__(".p2align 5"); 1650*3117ece4Schristos __asm__("nop"); 1651*3117ece4Schristos __asm__(".p2align 3"); 1652*3117ece4Schristos # else 1653*3117ece4Schristos __asm__(".p2align 4"); 1654*3117ece4Schristos __asm__("nop"); 1655*3117ece4Schristos __asm__(".p2align 3"); 1656*3117ece4Schristos # endif 1657*3117ece4Schristos #endif 1658*3117ece4Schristos 1659*3117ece4Schristos for ( ; nbSeq ; nbSeq--) { 1660*3117ece4Schristos seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); 1661*3117ece4Schristos size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd); 1662*3117ece4Schristos #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1663*3117ece4Schristos assert(!ZSTD_isError(oneSeqSize)); 1664*3117ece4Schristos ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); 1665*3117ece4Schristos #endif 1666*3117ece4Schristos if (UNLIKELY(ZSTD_isError(oneSeqSize))) 1667*3117ece4Schristos return oneSeqSize; 1668*3117ece4Schristos DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); 1669*3117ece4Schristos op += oneSeqSize; 1670*3117ece4Schristos } 1671*3117ece4Schristos 1672*3117ece4Schristos /* check if reached exact end */ 1673*3117ece4Schristos assert(nbSeq == 0); 1674*3117ece4Schristos RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); 1675*3117ece4Schristos /* save reps for next block */ 1676*3117ece4Schristos { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); } 1677*3117ece4Schristos } 1678*3117ece4Schristos 1679*3117ece4Schristos /* last literal segment */ 1680*3117ece4Schristos { size_t const lastLLSize = (size_t)(litEnd - litPtr); 1681*3117ece4Schristos DEBUGLOG(6, "copy last literals : %u", (U32)lastLLSize); 1682*3117ece4Schristos RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), 
dstSize_tooSmall, ""); 1683*3117ece4Schristos if (op != NULL) { 1684*3117ece4Schristos ZSTD_memcpy(op, litPtr, lastLLSize); 1685*3117ece4Schristos op += lastLLSize; 1686*3117ece4Schristos } } 1687*3117ece4Schristos 1688*3117ece4Schristos DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart)); 1689*3117ece4Schristos return (size_t)(op - ostart); 1690*3117ece4Schristos } 1691*3117ece4Schristos 1692*3117ece4Schristos static size_t 1693*3117ece4Schristos ZSTD_decompressSequences_default(ZSTD_DCtx* dctx, 1694*3117ece4Schristos void* dst, size_t maxDstSize, 1695*3117ece4Schristos const void* seqStart, size_t seqSize, int nbSeq, 1696*3117ece4Schristos const ZSTD_longOffset_e isLongOffset) 1697*3117ece4Schristos { 1698*3117ece4Schristos return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); 1699*3117ece4Schristos } 1700*3117ece4Schristos 1701*3117ece4Schristos static size_t 1702*3117ece4Schristos ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx, 1703*3117ece4Schristos void* dst, size_t maxDstSize, 1704*3117ece4Schristos const void* seqStart, size_t seqSize, int nbSeq, 1705*3117ece4Schristos const ZSTD_longOffset_e isLongOffset) 1706*3117ece4Schristos { 1707*3117ece4Schristos return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); 1708*3117ece4Schristos } 1709*3117ece4Schristos #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ 1710*3117ece4Schristos 1711*3117ece4Schristos #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT 1712*3117ece4Schristos 1713*3117ece4Schristos FORCE_INLINE_TEMPLATE 1714*3117ece4Schristos 1715*3117ece4Schristos size_t ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence, 1716*3117ece4Schristos const BYTE* const prefixStart, const BYTE* const dictEnd) 1717*3117ece4Schristos { 1718*3117ece4Schristos prefetchPos += sequence.litLength; 1719*3117ece4Schristos { const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart; 1720*3117ece4Schristos /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. 1721*3117ece4Schristos * No consequence though : memory address is only used for prefetching, not for dereferencing */ 1722*3117ece4Schristos const BYTE* const match = ZSTD_wrappedPtrSub(ZSTD_wrappedPtrAdd(matchBase, prefetchPos), sequence.offset); 1723*3117ece4Schristos PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */ 1724*3117ece4Schristos } 1725*3117ece4Schristos return prefetchPos + sequence.matchLength; 1726*3117ece4Schristos } 1727*3117ece4Schristos 1728*3117ece4Schristos /* This decoding function employs prefetching 1729*3117ece4Schristos * to reduce latency impact of cache misses. 
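 * It keeps a small ring of decoded-but-not-yet-executed sequences
 * (STORED_SEQS == 8 below), issuing a prefetch for each match as its
 * sequence is decoded, so the match data is likely cache-resident by the
 * time the copy runs.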
1730*3117ece4Schristos * It's generally employed when a block contains a significant portion of long-distance matches 1731*3117ece4Schristos * or when coupled with a "cold" dictionary */ 1732*3117ece4Schristos FORCE_INLINE_TEMPLATE size_t 1733*3117ece4Schristos ZSTD_decompressSequencesLong_body( 1734*3117ece4Schristos ZSTD_DCtx* dctx, 1735*3117ece4Schristos void* dst, size_t maxDstSize, 1736*3117ece4Schristos const void* seqStart, size_t seqSize, int nbSeq, 1737*3117ece4Schristos const ZSTD_longOffset_e isLongOffset) 1738*3117ece4Schristos { 1739*3117ece4Schristos const BYTE* ip = (const BYTE*)seqStart; 1740*3117ece4Schristos const BYTE* const iend = ip + seqSize; 1741*3117ece4Schristos BYTE* const ostart = (BYTE*)dst; 1742*3117ece4Schristos BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ZSTD_maybeNullPtrAdd(ostart, maxDstSize); 1743*3117ece4Schristos BYTE* op = ostart; 1744*3117ece4Schristos const BYTE* litPtr = dctx->litPtr; 1745*3117ece4Schristos const BYTE* litBufferEnd = dctx->litBufferEnd; 1746*3117ece4Schristos const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); 1747*3117ece4Schristos const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart); 1748*3117ece4Schristos const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); 1749*3117ece4Schristos 1750*3117ece4Schristos /* Regen sequences */ 1751*3117ece4Schristos if (nbSeq) { 1752*3117ece4Schristos #define STORED_SEQS 8 1753*3117ece4Schristos #define STORED_SEQS_MASK (STORED_SEQS-1) 1754*3117ece4Schristos #define ADVANCED_SEQS STORED_SEQS 1755*3117ece4Schristos seq_t sequences[STORED_SEQS]; 1756*3117ece4Schristos int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS); 1757*3117ece4Schristos seqState_t seqState; 1758*3117ece4Schristos int seqNb; 1759*3117ece4Schristos size_t prefetchPos = (size_t)(op-prefixStart); /* track position relative to prefixStart */ 1760*3117ece4Schristos 1761*3117ece4Schristos dctx->fseEntropy = 1; 1762*3117ece4Schristos { int i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; } 1763*3117ece4Schristos assert(dst != NULL); 1764*3117ece4Schristos assert(iend >= ip); 1765*3117ece4Schristos RETURN_ERROR_IF( 1766*3117ece4Schristos ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)), 1767*3117ece4Schristos corruption_detected, ""); 1768*3117ece4Schristos ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); 1769*3117ece4Schristos ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); 1770*3117ece4Schristos ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); 1771*3117ece4Schristos 1772*3117ece4Schristos /* prepare in advance */ 1773*3117ece4Schristos for (seqNb=0; seqNb<seqAdvance; seqNb++) { 1774*3117ece4Schristos seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, seqNb == nbSeq-1); 1775*3117ece4Schristos prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); 1776*3117ece4Schristos sequences[seqNb] = sequence; 1777*3117ece4Schristos } 1778*3117ece4Schristos 1779*3117ece4Schristos /* decompress without stomping litBuffer */ 1780*3117ece4Schristos for (; seqNb < nbSeq; seqNb++) { 1781*3117ece4Schristos seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset, seqNb == nbSeq-1); 1782*3117ece4Schristos 1783*3117ece4Schristos if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) { 1784*3117ece4Schristos /* lit buffer is reaching split point, empty out the
first buffer and transition to litExtraBuffer */ 1785*3117ece4Schristos const size_t leftoverLit = dctx->litBufferEnd - litPtr; 1786*3117ece4Schristos if (leftoverLit) 1787*3117ece4Schristos { 1788*3117ece4Schristos RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); 1789*3117ece4Schristos ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); 1790*3117ece4Schristos sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength -= leftoverLit; 1791*3117ece4Schristos op += leftoverLit; 1792*3117ece4Schristos } 1793*3117ece4Schristos litPtr = dctx->litExtraBuffer; 1794*3117ece4Schristos litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; 1795*3117ece4Schristos dctx->litBufferLocation = ZSTD_not_in_dst; 1796*3117ece4Schristos { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); 1797*3117ece4Schristos #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1798*3117ece4Schristos assert(!ZSTD_isError(oneSeqSize)); 1799*3117ece4Schristos ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); 1800*3117ece4Schristos #endif 1801*3117ece4Schristos if (ZSTD_isError(oneSeqSize)) return oneSeqSize; 1802*3117ece4Schristos 1803*3117ece4Schristos prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); 1804*3117ece4Schristos sequences[seqNb & STORED_SEQS_MASK] = sequence; 1805*3117ece4Schristos op += oneSeqSize; 1806*3117ece4Schristos } } 1807*3117ece4Schristos else 1808*3117ece4Schristos { 1809*3117ece4Schristos /* lit buffer is either wholly contained in first or second split, or not split at all*/ 1810*3117ece4Schristos size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ? 
1811*3117ece4Schristos ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) : 1812*3117ece4Schristos ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); 1813*3117ece4Schristos #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1814*3117ece4Schristos assert(!ZSTD_isError(oneSeqSize)); 1815*3117ece4Schristos ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); 1816*3117ece4Schristos #endif 1817*3117ece4Schristos if (ZSTD_isError(oneSeqSize)) return oneSeqSize; 1818*3117ece4Schristos 1819*3117ece4Schristos prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); 1820*3117ece4Schristos sequences[seqNb & STORED_SEQS_MASK] = sequence; 1821*3117ece4Schristos op += oneSeqSize; 1822*3117ece4Schristos } 1823*3117ece4Schristos } 1824*3117ece4Schristos RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); 1825*3117ece4Schristos 1826*3117ece4Schristos /* finish queue */ 1827*3117ece4Schristos seqNb -= seqAdvance; 1828*3117ece4Schristos for ( ; seqNb<nbSeq ; seqNb++) { 1829*3117ece4Schristos seq_t *sequence = &(sequences[seqNb&STORED_SEQS_MASK]); 1830*3117ece4Schristos if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) { 1831*3117ece4Schristos const size_t leftoverLit = dctx->litBufferEnd - litPtr; 1832*3117ece4Schristos if (leftoverLit) { 1833*3117ece4Schristos RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); 1834*3117ece4Schristos ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); 1835*3117ece4Schristos sequence->litLength -= leftoverLit; 1836*3117ece4Schristos op += leftoverLit; 1837*3117ece4Schristos } 1838*3117ece4Schristos litPtr = dctx->litExtraBuffer; 1839*3117ece4Schristos litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; 1840*3117ece4Schristos dctx->litBufferLocation = ZSTD_not_in_dst; 1841*3117ece4Schristos { size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); 1842*3117ece4Schristos #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1843*3117ece4Schristos assert(!ZSTD_isError(oneSeqSize)); 1844*3117ece4Schristos ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); 1845*3117ece4Schristos #endif 1846*3117ece4Schristos if (ZSTD_isError(oneSeqSize)) return oneSeqSize; 1847*3117ece4Schristos op += oneSeqSize; 1848*3117ece4Schristos } 1849*3117ece4Schristos } 1850*3117ece4Schristos else 1851*3117ece4Schristos { 1852*3117ece4Schristos size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ? 
1853*3117ece4Schristos ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence->litLength - WILDCOPY_OVERLENGTH, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) : 1854*3117ece4Schristos ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); 1855*3117ece4Schristos #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1856*3117ece4Schristos assert(!ZSTD_isError(oneSeqSize)); 1857*3117ece4Schristos ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); 1858*3117ece4Schristos #endif 1859*3117ece4Schristos if (ZSTD_isError(oneSeqSize)) return oneSeqSize; 1860*3117ece4Schristos op += oneSeqSize; 1861*3117ece4Schristos } 1862*3117ece4Schristos } 1863*3117ece4Schristos 1864*3117ece4Schristos /* save reps for next block */ 1865*3117ece4Schristos { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); } 1866*3117ece4Schristos } 1867*3117ece4Schristos 1868*3117ece4Schristos /* last literal segment */ 1869*3117ece4Schristos if (dctx->litBufferLocation == ZSTD_split) { /* first deplete literal buffer in dst, then copy litExtraBuffer */ 1870*3117ece4Schristos size_t const lastLLSize = litBufferEnd - litPtr; 1871*3117ece4Schristos RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); 1872*3117ece4Schristos if (op != NULL) { 1873*3117ece4Schristos ZSTD_memmove(op, litPtr, lastLLSize); 1874*3117ece4Schristos op += lastLLSize; 1875*3117ece4Schristos } 1876*3117ece4Schristos litPtr = dctx->litExtraBuffer; 1877*3117ece4Schristos litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; 1878*3117ece4Schristos } 1879*3117ece4Schristos { size_t const lastLLSize = litBufferEnd - litPtr; 1880*3117ece4Schristos RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); 1881*3117ece4Schristos if (op != NULL) { 1882*3117ece4Schristos ZSTD_memmove(op, litPtr, lastLLSize); 1883*3117ece4Schristos op += lastLLSize; 1884*3117ece4Schristos } 1885*3117ece4Schristos } 1886*3117ece4Schristos 1887*3117ece4Schristos return (size_t)(op - ostart); 1888*3117ece4Schristos } 1889*3117ece4Schristos 1890*3117ece4Schristos static size_t 1891*3117ece4Schristos ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx, 1892*3117ece4Schristos void* dst, size_t maxDstSize, 1893*3117ece4Schristos const void* seqStart, size_t seqSize, int nbSeq, 1894*3117ece4Schristos const ZSTD_longOffset_e isLongOffset) 1895*3117ece4Schristos { 1896*3117ece4Schristos return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); 1897*3117ece4Schristos } 1898*3117ece4Schristos #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ 1899*3117ece4Schristos 1900*3117ece4Schristos 1901*3117ece4Schristos 1902*3117ece4Schristos #if DYNAMIC_BMI2 1903*3117ece4Schristos 1904*3117ece4Schristos #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG 1905*3117ece4Schristos static BMI2_TARGET_ATTRIBUTE size_t 1906*3117ece4Schristos DONT_VECTORIZE 1907*3117ece4Schristos ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx, 1908*3117ece4Schristos void* dst, size_t maxDstSize, 1909*3117ece4Schristos const void* seqStart, size_t seqSize, int nbSeq, 1910*3117ece4Schristos const ZSTD_longOffset_e isLongOffset) 1911*3117ece4Schristos { 1912*3117ece4Schristos return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); 1913*3117ece4Schristos } 1914*3117ece4Schristos static BMI2_TARGET_ATTRIBUTE size_t 
1915*3117ece4Schristos DONT_VECTORIZE 1916*3117ece4Schristos ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx, 1917*3117ece4Schristos void* dst, size_t maxDstSize, 1918*3117ece4Schristos const void* seqStart, size_t seqSize, int nbSeq, 1919*3117ece4Schristos const ZSTD_longOffset_e isLongOffset) 1920*3117ece4Schristos { 1921*3117ece4Schristos return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); 1922*3117ece4Schristos } 1923*3117ece4Schristos #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ 1924*3117ece4Schristos 1925*3117ece4Schristos #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT 1926*3117ece4Schristos static BMI2_TARGET_ATTRIBUTE size_t 1927*3117ece4Schristos ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx, 1928*3117ece4Schristos void* dst, size_t maxDstSize, 1929*3117ece4Schristos const void* seqStart, size_t seqSize, int nbSeq, 1930*3117ece4Schristos const ZSTD_longOffset_e isLongOffset) 1931*3117ece4Schristos { 1932*3117ece4Schristos return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); 1933*3117ece4Schristos } 1934*3117ece4Schristos #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ 1935*3117ece4Schristos 1936*3117ece4Schristos #endif /* DYNAMIC_BMI2 */ 1937*3117ece4Schristos 1938*3117ece4Schristos typedef size_t (*ZSTD_decompressSequences_t)( 1939*3117ece4Schristos ZSTD_DCtx* dctx, 1940*3117ece4Schristos void* dst, size_t maxDstSize, 1941*3117ece4Schristos const void* seqStart, size_t seqSize, int nbSeq, 1942*3117ece4Schristos const ZSTD_longOffset_e isLongOffset); 1943*3117ece4Schristos 1944*3117ece4Schristos #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG 1945*3117ece4Schristos static size_t 1946*3117ece4Schristos ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, 1947*3117ece4Schristos const void* seqStart, size_t seqSize, int nbSeq, 1948*3117ece4Schristos const ZSTD_longOffset_e isLongOffset) 1949*3117ece4Schristos { 1950*3117ece4Schristos DEBUGLOG(5, "ZSTD_decompressSequences"); 1951*3117ece4Schristos #if DYNAMIC_BMI2 1952*3117ece4Schristos if (ZSTD_DCtx_get_bmi2(dctx)) { 1953*3117ece4Schristos return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); 1954*3117ece4Schristos } 1955*3117ece4Schristos #endif 1956*3117ece4Schristos return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); 1957*3117ece4Schristos } 1958*3117ece4Schristos static size_t 1959*3117ece4Schristos ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, 1960*3117ece4Schristos const void* seqStart, size_t seqSize, int nbSeq, 1961*3117ece4Schristos const ZSTD_longOffset_e isLongOffset) 1962*3117ece4Schristos { 1963*3117ece4Schristos DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer"); 1964*3117ece4Schristos #if DYNAMIC_BMI2 1965*3117ece4Schristos if (ZSTD_DCtx_get_bmi2(dctx)) { 1966*3117ece4Schristos return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); 1967*3117ece4Schristos } 1968*3117ece4Schristos #endif 1969*3117ece4Schristos return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); 1970*3117ece4Schristos } 1971*3117ece4Schristos #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ 1972*3117ece4Schristos 1973*3117ece4Schristos 1974*3117ece4Schristos #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT 1975*3117ece4Schristos /* 
1975*3117ece4Schristos /* ZSTD_decompressSequencesLong() :
1976*3117ece4Schristos  * decompression function triggered when a minimum share of offsets is considered "long",
1977*3117ece4Schristos  * aka out of cache.
1978*3117ece4Schristos  * note : the "long" definition is overloaded here, sometimes meaning "wider than bitstream register", and sometimes meaning "farther than memory cache distance".
1979*3117ece4Schristos  * This function will try to mitigate main memory latency through the use of prefetching. */
1980*3117ece4Schristos static size_t
1981*3117ece4Schristos ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,
1982*3117ece4Schristos                              void* dst, size_t maxDstSize,
1983*3117ece4Schristos                              const void* seqStart, size_t seqSize, int nbSeq,
1984*3117ece4Schristos                              const ZSTD_longOffset_e isLongOffset)
1985*3117ece4Schristos {
1986*3117ece4Schristos     DEBUGLOG(5, "ZSTD_decompressSequencesLong");
1987*3117ece4Schristos #if DYNAMIC_BMI2
1988*3117ece4Schristos     if (ZSTD_DCtx_get_bmi2(dctx)) {
1989*3117ece4Schristos         return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
1990*3117ece4Schristos     }
1991*3117ece4Schristos #endif
1992*3117ece4Schristos     return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
1993*3117ece4Schristos }
1994*3117ece4Schristos #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
1995*3117ece4Schristos 
1996*3117ece4Schristos 
1997*3117ece4Schristos /**
1998*3117ece4Schristos  * @returns The total size of the history referenceable by zstd, including
1999*3117ece4Schristos  * both the prefix and the extDict. At @p op any offset larger than this
2000*3117ece4Schristos  * is invalid.
2001*3117ece4Schristos  */
2002*3117ece4Schristos static size_t ZSTD_totalHistorySize(BYTE* op, BYTE const* virtualStart)
2003*3117ece4Schristos {
2004*3117ece4Schristos     return (size_t)(op - virtualStart);
2005*3117ece4Schristos }
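/* How ZSTD_decompressSequencesLong_body hides memory latency, in sketch
 * form: sequences are decoded a few positions ahead of execution, and each
 * future match source is prefetched before its copy is due. Illustrative
 * only; the helper names below are hypothetical, and the real pipeline
 * (STORED_SEQS entries deep) lives in ZSTD_decompressSequencesLong_body:
 *
 *     enum { AHEAD = 4 };
 *     seq_t pipeline[AHEAD];
 *     size_t seqNb;
 *     // prologue: fill pipeline[0..AHEAD-1], prefetching each match source
 *     for (seqNb = AHEAD; seqNb < (size_t)nbSeq; seqNb++) {
 *         seq_t const next = decodeNextSequence(&seqState);   // hypothetical
 *         PREFETCH_L1(matchSourceOf(next));                   // pull match bytes toward L1
 *         executeSequence(&op, pipeline[seqNb % AHEAD]);      // older match is hot by now
 *         pipeline[seqNb % AHEAD] = next;
 *     }
 *     // epilogue: drain the AHEAD sequences still buffered in the pipeline
 */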
2006*3117ece4Schristos 
2007*3117ece4Schristos typedef struct {
2008*3117ece4Schristos     unsigned longOffsetShare;
2009*3117ece4Schristos     unsigned maxNbAdditionalBits;
2010*3117ece4Schristos } ZSTD_OffsetInfo;
2011*3117ece4Schristos 
2012*3117ece4Schristos /* ZSTD_getOffsetInfo() :
2013*3117ece4Schristos  * condition : offTable must be valid
2014*3117ece4Schristos  * @return : "share" of long offsets (arbitrarily defined as > (1<<23))
2015*3117ece4Schristos  *           compared to maximum possible of (1<<OffFSELog),
2016*3117ece4Schristos  *           as well as the maximum number of additional bits required.
2017*3117ece4Schristos  */
2018*3117ece4Schristos static ZSTD_OffsetInfo
2019*3117ece4Schristos ZSTD_getOffsetInfo(const ZSTD_seqSymbol* offTable, int nbSeq)
2020*3117ece4Schristos {
2021*3117ece4Schristos     ZSTD_OffsetInfo info = {0, 0};
2022*3117ece4Schristos     /* If nbSeq == 0, then the offTable is uninitialized, but we have
2023*3117ece4Schristos      * no sequences, so both values should be 0.
2024*3117ece4Schristos      */
2025*3117ece4Schristos     if (nbSeq != 0) {
2026*3117ece4Schristos         const void* ptr = offTable;
2027*3117ece4Schristos         U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog;
2028*3117ece4Schristos         const ZSTD_seqSymbol* table = offTable + 1;
2029*3117ece4Schristos         U32 const max = 1 << tableLog;
2030*3117ece4Schristos         U32 u;
2031*3117ece4Schristos         DEBUGLOG(5, "ZSTD_getOffsetInfo: (tableLog=%u)", tableLog);
2032*3117ece4Schristos 
2033*3117ece4Schristos         assert(max <= (1 << OffFSELog));  /* max not too large */
2034*3117ece4Schristos         for (u=0; u<max; u++) {
2035*3117ece4Schristos             info.maxNbAdditionalBits = MAX(info.maxNbAdditionalBits, table[u].nbAdditionalBits);
2036*3117ece4Schristos             if (table[u].nbAdditionalBits > 22) info.longOffsetShare += 1;
2037*3117ece4Schristos         }
2038*3117ece4Schristos 
2039*3117ece4Schristos         assert(tableLog <= OffFSELog);
2040*3117ece4Schristos         info.longOffsetShare <<= (OffFSELog - tableLog);  /* scale to OffFSELog */
2041*3117ece4Schristos     }
2042*3117ece4Schristos 
2043*3117ece4Schristos     return info;
2044*3117ece4Schristos }
2045*3117ece4Schristos 
2046*3117ece4Schristos /**
2047*3117ece4Schristos  * @returns The maximum offset we can decode in one read of our bitstream, without
2048*3117ece4Schristos  * reloading more bits in the middle of the offset bits read. Any offsets larger
2049*3117ece4Schristos  * than this must use the long offset decoder.
2050*3117ece4Schristos  */
2051*3117ece4Schristos static size_t ZSTD_maxShortOffset(void)
2052*3117ece4Schristos {
2053*3117ece4Schristos     if (MEM_64bits()) {
2054*3117ece4Schristos         /* We can decode any offset without reloading bits.
2055*3117ece4Schristos          * This might change if the max window size grows.
2056*3117ece4Schristos          */
2057*3117ece4Schristos         ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31);
2058*3117ece4Schristos         return (size_t)-1;
2059*3117ece4Schristos     } else {
2060*3117ece4Schristos         /* The maximum offBase is (1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1.
2061*3117ece4Schristos          * This offBase would require STREAM_ACCUMULATOR_MIN extra bits.
2062*3117ece4Schristos          * Then we have to subtract ZSTD_REP_NUM to get the maximum possible offset.
2063*3117ece4Schristos          */
2064*3117ece4Schristos         size_t const maxOffbase = ((size_t)1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1;
2065*3117ece4Schristos         size_t const maxOffset = maxOffbase - ZSTD_REP_NUM;
2066*3117ece4Schristos         assert(ZSTD_highbit32((U32)maxOffbase) == STREAM_ACCUMULATOR_MIN);
2067*3117ece4Schristos         return maxOffset;
2068*3117ece4Schristos     }
2069*3117ece4Schristos }
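/* Worked example for the 32-bit branch above, assuming
 * STREAM_ACCUMULATOR_MIN_32 == 25 (its value in bitstream.h at the time of
 * writing):
 *
 *     maxOffbase = (1 << 26) - 1 = 67108863   // needs 25 extra bits to decode
 *     maxOffset  = 67108863 - ZSTD_REP_NUM(3) = 67108860
 *
 * so on 32-bit targets, any offset beyond roughly 64 MiB must go through
 * the long-offset decoder. */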
2070*3117ece4Schristos 
2071*3117ece4Schristos size_t
2072*3117ece4Schristos ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
2073*3117ece4Schristos                               void* dst, size_t dstCapacity,
2074*3117ece4Schristos                               const void* src, size_t srcSize, const streaming_operation streaming)
2075*3117ece4Schristos {   /* blockType == blockCompressed */
2076*3117ece4Schristos     const BYTE* ip = (const BYTE*)src;
2077*3117ece4Schristos     DEBUGLOG(5, "ZSTD_decompressBlock_internal (cSize : %u)", (unsigned)srcSize);
2078*3117ece4Schristos 
2079*3117ece4Schristos     /* Note : the wording of the specification
2080*3117ece4Schristos      * allows a compressed block to be sized exactly ZSTD_blockSizeMax(dctx).
2081*3117ece4Schristos      * This generally does not happen, as it makes little sense,
2082*3117ece4Schristos      * since an uncompressed block would feature the same size and have no decompression cost.
2083*3117ece4Schristos      * Also, note that decoders from reference libzstd before v1.5.4
2084*3117ece4Schristos      * would consider this edge case an error.
2085*3117ece4Schristos      * As a consequence, avoid generating compressed blocks of size ZSTD_blockSizeMax(dctx)
2086*3117ece4Schristos      * for broader compatibility with the deployed ecosystem of zstd decoders. */
2087*3117ece4Schristos     RETURN_ERROR_IF(srcSize > ZSTD_blockSizeMax(dctx), srcSize_wrong, "");
2088*3117ece4Schristos 
2089*3117ece4Schristos     /* Decode literals section */
2090*3117ece4Schristos     {   size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming);
2091*3117ece4Schristos         DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : cSize=%u, nbLiterals=%zu", (U32)litCSize, dctx->litSize);
2092*3117ece4Schristos         if (ZSTD_isError(litCSize)) return litCSize;
2093*3117ece4Schristos         ip += litCSize;
2094*3117ece4Schristos         srcSize -= litCSize;
2095*3117ece4Schristos     }
2096*3117ece4Schristos 
2097*3117ece4Schristos     /* Build Decoding Tables */
2098*3117ece4Schristos     {
2099*3117ece4Schristos         /* Compute the maximum block size, which must also work when !frame and fParams are unset.
2100*3117ece4Schristos          * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t.
2101*3117ece4Schristos          */
2102*3117ece4Schristos         size_t const blockSizeMax = MIN(dstCapacity, ZSTD_blockSizeMax(dctx));
2103*3117ece4Schristos         size_t const totalHistorySize = ZSTD_totalHistorySize(ZSTD_maybeNullPtrAdd((BYTE*)dst, blockSizeMax), (BYTE const*)dctx->virtualStart);
2104*3117ece4Schristos         /* isLongOffset must be true if there are long offsets.
2105*3117ece4Schristos          * Offsets are long if they are larger than ZSTD_maxShortOffset().
2106*3117ece4Schristos          * We don't expect that to be the case in 64-bit mode.
2107*3117ece4Schristos          *
2108*3117ece4Schristos          * We check here to see if our history is large enough to allow long offsets.
2109*3117ece4Schristos          * If it isn't, then we can't possibly have (valid) long offsets. If the offset
2110*3117ece4Schristos          * is invalid, then it is okay to read it incorrectly.
2111*3117ece4Schristos          *
2112*3117ece4Schristos          * If isLongOffset is true, then we will later check our decoding table to see
2113*3117ece4Schristos          * if it is even possible to generate long offsets.
2114*3117ece4Schristos          */
2115*3117ece4Schristos         ZSTD_longOffset_e isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (totalHistorySize > ZSTD_maxShortOffset()));
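        /* Worked example of the decision above (illustrative): on a 32-bit
         * target with STREAM_ACCUMULATOR_MIN_32 == 25, ZSTD_maxShortOffset()
         * is about 2^26. With roughly 128 MiB of history already decoded,
         * totalHistorySize ~= 2^27 > 2^26, so isLongOffset starts out true;
         * it may still be downgraded further below, once the offset table
         * proves it cannot emit more than STREAM_ACCUMULATOR_MIN extra bits.
         */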
2116*3117ece4Schristos         /* These macros control at build-time which decompressor implementation
2117*3117ece4Schristos          * we use. If neither is defined, we do some inspection and dispatch at
2118*3117ece4Schristos          * runtime.
2119*3117ece4Schristos          */
2120*3117ece4Schristos #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
2121*3117ece4Schristos     !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
2122*3117ece4Schristos         int usePrefetchDecoder = dctx->ddictIsCold;
2123*3117ece4Schristos #else
2124*3117ece4Schristos         /* Set to 1 to avoid computing offset info if we don't need to.
2125*3117ece4Schristos          * Otherwise this value is ignored.
2126*3117ece4Schristos          */
2127*3117ece4Schristos         int usePrefetchDecoder = 1;
2128*3117ece4Schristos #endif
2129*3117ece4Schristos         int nbSeq;
2130*3117ece4Schristos         size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize);
2131*3117ece4Schristos         if (ZSTD_isError(seqHSize)) return seqHSize;
2132*3117ece4Schristos         ip += seqHSize;
2133*3117ece4Schristos         srcSize -= seqHSize;
2134*3117ece4Schristos 
2135*3117ece4Schristos         RETURN_ERROR_IF((dst == NULL || dstCapacity == 0) && nbSeq > 0, dstSize_tooSmall, "NULL not handled");
2136*3117ece4Schristos         RETURN_ERROR_IF(MEM_64bits() && sizeof(size_t) == sizeof(void*) && (size_t)(-1) - (size_t)dst < (size_t)(1 << 20), dstSize_tooSmall,
2137*3117ece4Schristos                 "invalid dst");
2138*3117ece4Schristos 
2139*3117ece4Schristos         /* If we could potentially have long offsets, or we might want to use the prefetch decoder,
2140*3117ece4Schristos          * compute information about the share of long offsets, and the maximum nbAdditionalBits.
2141*3117ece4Schristos          * NOTE: could probably use a larger nbSeq limit
2142*3117ece4Schristos          */
2143*3117ece4Schristos         if (isLongOffset || (!usePrefetchDecoder && (totalHistorySize > (1u << 24)) && (nbSeq > 8))) {
2144*3117ece4Schristos             ZSTD_OffsetInfo const info = ZSTD_getOffsetInfo(dctx->OFTptr, nbSeq);
2145*3117ece4Schristos             if (isLongOffset && info.maxNbAdditionalBits <= STREAM_ACCUMULATOR_MIN) {
2146*3117ece4Schristos                 /* If isLongOffset, but the maximum number of additional bits that we see in our table is small
2147*3117ece4Schristos                  * enough, then we know it is impossible to have too long an offset in this block, so we can
2148*3117ece4Schristos                  * use the regular offset decoder.
2149*3117ece4Schristos                  */
2150*3117ece4Schristos                 isLongOffset = ZSTD_lo_isRegularOffset;
2151*3117ece4Schristos             }
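            /* The threshold arithmetic below, spelled out: longOffsetShare
             * is scaled by ZSTD_getOffsetInfo() to a denominator of
             * 1 << OffFSELog = 256 table slots, so
             *
             *     minShare =  7  =>  7/256  ~= 2.73%  (64-bit targets)
             *     minShare = 20  => 20/256  ~= 7.81%  (32-bit targets)
             *
             * of the slots decoding to a long offset triggers prefetching.
             */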
2152*3117ece4Schristos             if (!usePrefetchDecoder) {
2153*3117ece4Schristos                 U32 const minShare = MEM_64bits() ? 7 : 20;  /* heuristic values, correspond to 2.73% and 7.81% */
2154*3117ece4Schristos                 usePrefetchDecoder = (info.longOffsetShare >= minShare);
2155*3117ece4Schristos             }
2156*3117ece4Schristos         }
2157*3117ece4Schristos 
2158*3117ece4Schristos         dctx->ddictIsCold = 0;
2159*3117ece4Schristos 
2160*3117ece4Schristos #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
2161*3117ece4Schristos     !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
2162*3117ece4Schristos         if (usePrefetchDecoder) {
2163*3117ece4Schristos #else
2164*3117ece4Schristos         (void)usePrefetchDecoder;
2165*3117ece4Schristos         {
2166*3117ece4Schristos #endif
2167*3117ece4Schristos #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
2168*3117ece4Schristos             return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
2169*3117ece4Schristos #endif
2170*3117ece4Schristos         }
2171*3117ece4Schristos 
2172*3117ece4Schristos #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
2173*3117ece4Schristos         /* else */
2174*3117ece4Schristos         if (dctx->litBufferLocation == ZSTD_split)
2175*3117ece4Schristos             return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
2176*3117ece4Schristos         else
2177*3117ece4Schristos             return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
2178*3117ece4Schristos #endif
2179*3117ece4Schristos     }
2180*3117ece4Schristos }
2181*3117ece4Schristos 
2182*3117ece4Schristos 
2183*3117ece4Schristos ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
2184*3117ece4Schristos void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize)
2185*3117ece4Schristos {
2186*3117ece4Schristos     if (dst != dctx->previousDstEnd && dstSize > 0) {   /* not contiguous */
2187*3117ece4Schristos         dctx->dictEnd = dctx->previousDstEnd;
2188*3117ece4Schristos         dctx->virtualStart = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart));
2189*3117ece4Schristos         dctx->prefixStart = dst;
2190*3117ece4Schristos         dctx->previousDstEnd = dst;
2191*3117ece4Schristos     }
2192*3117ece4Schristos }
2193*3117ece4Schristos 
2194*3117ece4Schristos 
2195*3117ece4Schristos size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx,
2196*3117ece4Schristos                                        void* dst, size_t dstCapacity,
2197*3117ece4Schristos                                        const void* src, size_t srcSize)
2198*3117ece4Schristos {
2199*3117ece4Schristos     size_t dSize;
2200*3117ece4Schristos     dctx->isFrameDecompression = 0;
2201*3117ece4Schristos     ZSTD_checkContinuity(dctx, dst, dstCapacity);
2202*3117ece4Schristos     dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, not_streaming);
2203*3117ece4Schristos     FORWARD_IF_ERROR(dSize, "");
2204*3117ece4Schristos     dctx->previousDstEnd = (char*)dst + dSize;
2205*3117ece4Schristos     return dSize;
2206*3117ece4Schristos }
2207*3117ece4Schristos 
2208*3117ece4Schristos 
2209*3117ece4Schristos /* NOTE: Must just wrap ZSTD_decompressBlock_deprecated() */
2210*3117ece4Schristos size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
2211*3117ece4Schristos                             void* dst, size_t dstCapacity,
2212*3117ece4Schristos                             const void* src, size_t srcSize)
2213*3117ece4Schristos {
2214*3117ece4Schristos     return ZSTD_decompressBlock_deprecated(dctx, dst, dstCapacity, src, srcSize);
2215*3117ece4Schristos }
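/* Minimal usage sketch for the raw block API above, assuming the
 * experimental block-level API exposed via ZSTD_STATIC_LINKING_ONLY in
 * zstd.h, and a `cBlock` produced by ZSTD_compressBlock() with matching
 * parameters (illustrative only; decompressOneBlock is hypothetical):
 *
 *     #define ZSTD_STATIC_LINKING_ONLY
 *     #include <zstd.h>
 *
 *     static size_t decompressOneBlock(void* dst, size_t dstCapacity,
 *                                      const void* cBlock, size_t cBlockSize)
 *     {
 *         ZSTD_DCtx* const dctx = ZSTD_createDCtx();
 *         size_t dSize;
 *         if (dctx == NULL) return (size_t)-1;
 *         dSize = ZSTD_decompressBegin(dctx);   // reset history tracking
 *         if (!ZSTD_isError(dSize))
 *             dSize = ZSTD_decompressBlock(dctx, dst, dstCapacity, cBlock, cBlockSize);
 *         ZSTD_freeDCtx(dctx);
 *         return dSize;                         // check with ZSTD_isError()
 *     }
 */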