1 /* 2 * Copyright (c) Meta Platforms, Inc. and affiliates. 3 * All rights reserved. 4 * 5 * This source code is licensed under both the BSD-style license (found in the 6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found 7 * in the COPYING file in the root directory of this source tree). 8 * You may select, at your option, one of the above-listed licenses. 9 */ 10 11 /* zstd_decompress_block : 12 * this module takes care of decompressing _compressed_ block */ 13 14 /*-******************************************************* 15 * Dependencies 16 *********************************************************/ 17 #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ 18 #include "../common/compiler.h" /* prefetch */ 19 #include "../common/cpu.h" /* bmi2 */ 20 #include "../common/mem.h" /* low level memory routines */ 21 #define FSE_STATIC_LINKING_ONLY 22 #include "../common/fse.h" 23 #include "../common/huf.h" 24 #include "../common/zstd_internal.h" 25 #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ 26 #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ 27 #include "zstd_decompress_block.h" 28 #include "../common/bits.h" /* ZSTD_highbit32 */ 29 30 /*_******************************************************* 31 * Macros 32 **********************************************************/ 33 34 /* These two optional macros force the use one way or another of the two 35 * ZSTD_decompressSequences implementations. You can't force in both directions 36 * at the same time. 37 */ 38 #if defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ 39 defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) 40 #error "Cannot force the use of the short and the long ZSTD_decompressSequences variants!" 41 #endif 42 43 44 /*_******************************************************* 45 * Memory operations 46 **********************************************************/ 47 static void ZSTD_copy4(void* dst, const void* src) { ZSTD_memcpy(dst, src, 4); } 48 49 50 /*-************************************************************* 51 * Block decoding 52 ***************************************************************/ 53 54 static size_t ZSTD_blockSizeMax(ZSTD_DCtx const* dctx) 55 { 56 size_t const blockSizeMax = dctx->isFrameDecompression ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX; 57 assert(blockSizeMax <= ZSTD_BLOCKSIZE_MAX); 58 return blockSizeMax; 59 } 60 61 /*! 
ZSTD_getcBlockSize() : 62 * Provides the size of compressed block from block header `src` */ 63 size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, 64 blockProperties_t* bpPtr) 65 { 66 RETURN_ERROR_IF(srcSize < ZSTD_blockHeaderSize, srcSize_wrong, ""); 67 68 { U32 const cBlockHeader = MEM_readLE24(src); 69 U32 const cSize = cBlockHeader >> 3; 70 bpPtr->lastBlock = cBlockHeader & 1; 71 bpPtr->blockType = (blockType_e)((cBlockHeader >> 1) & 3); 72 bpPtr->origSize = cSize; /* only useful for RLE */ 73 if (bpPtr->blockType == bt_rle) return 1; 74 RETURN_ERROR_IF(bpPtr->blockType == bt_reserved, corruption_detected, ""); 75 return cSize; 76 } 77 } 78 79 /* Allocate buffer for literals, either overlapping current dst, or split between dst and litExtraBuffer, or stored entirely within litExtraBuffer */ 80 static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize, 81 const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately) 82 { 83 size_t const blockSizeMax = ZSTD_blockSizeMax(dctx); 84 assert(litSize <= blockSizeMax); 85 assert(dctx->isFrameDecompression || streaming == not_streaming); 86 assert(expectedWriteSize <= blockSizeMax); 87 if (streaming == not_streaming && dstCapacity > blockSizeMax + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) { 88 /* If we aren't streaming, we can just put the literals after the output 89 * of the current block. We don't need to worry about overwriting the 90 * extDict of our window, because it doesn't exist. 91 * So if we have space after the end of the block, just put it there. 92 */ 93 dctx->litBuffer = (BYTE*)dst + blockSizeMax + WILDCOPY_OVERLENGTH; 94 dctx->litBufferEnd = dctx->litBuffer + litSize; 95 dctx->litBufferLocation = ZSTD_in_dst; 96 } else if (litSize <= ZSTD_LITBUFFEREXTRASIZE) { 97 /* Literals fit entirely within the extra buffer, put them there to avoid 98 * having to split the literals. 99 */ 100 dctx->litBuffer = dctx->litExtraBuffer; 101 dctx->litBufferEnd = dctx->litBuffer + litSize; 102 dctx->litBufferLocation = ZSTD_not_in_dst; 103 } else { 104 assert(blockSizeMax > ZSTD_LITBUFFEREXTRASIZE); 105 /* Literals must be split between the output block and the extra lit 106 * buffer. We fill the extra lit buffer with the tail of the literals, 107 * and put the rest of the literals at the end of the block, with 108 * WILDCOPY_OVERLENGTH of buffer room to allow for overreads. 109 * This MUST not write more than our maxBlockSize beyond dst, because in 110 * streaming mode, that could overwrite part of our extDict window. 111 */ 112 if (splitImmediately) { 113 /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ 114 dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; 115 dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE; 116 } else { 117 /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */ 118 dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize; 119 dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize; 120 } 121 dctx->litBufferLocation = ZSTD_split; 122 assert(dctx->litBufferEnd <= (BYTE*)dst + expectedWriteSize); 123 } 124 } 125 126 /*! ZSTD_decodeLiteralsBlock() : 127 * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored 128 * in the dstBuffer. 
If there is room to do so, it will be stored in full in the excess dst space after where the current 129 * block will be output. Otherwise it will be stored at the end of the current dst blockspace, with a small portion being 130 * stored in dctx->litExtraBuffer to help keep it "ahead" of the current output write. 131 * 132 * @return : nb of bytes read from src (< srcSize ) 133 * note : symbol not declared but exposed for fullbench */ 134 static size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, 135 const void* src, size_t srcSize, /* note : srcSize < BLOCKSIZE */ 136 void* dst, size_t dstCapacity, const streaming_operation streaming) 137 { 138 DEBUGLOG(5, "ZSTD_decodeLiteralsBlock"); 139 RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, ""); 140 141 { const BYTE* const istart = (const BYTE*) src; 142 symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3); 143 size_t const blockSizeMax = ZSTD_blockSizeMax(dctx); 144 145 switch(litEncType) 146 { 147 case set_repeat: 148 DEBUGLOG(5, "set_repeat flag : re-using stats from previous compressed literals block"); 149 RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted, ""); 150 ZSTD_FALLTHROUGH; 151 152 case set_compressed: 153 RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need up to 5 for case 3"); 154 { size_t lhSize, litSize, litCSize; 155 U32 singleStream=0; 156 U32 const lhlCode = (istart[0] >> 2) & 3; 157 U32 const lhc = MEM_readLE32(istart); 158 size_t hufSuccess; 159 size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); 160 int const flags = 0 161 | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0) 162 | (dctx->disableHufAsm ? HUF_flags_disableAsm : 0); 163 switch(lhlCode) 164 { 165 case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */ 166 /* 2 - 2 - 10 - 10 */ 167 singleStream = !lhlCode; 168 lhSize = 3; 169 litSize = (lhc >> 4) & 0x3FF; 170 litCSize = (lhc >> 14) & 0x3FF; 171 break; 172 case 2: 173 /* 2 - 2 - 14 - 14 */ 174 lhSize = 4; 175 litSize = (lhc >> 4) & 0x3FFF; 176 litCSize = lhc >> 18; 177 break; 178 case 3: 179 /* 2 - 2 - 18 - 18 */ 180 lhSize = 5; 181 litSize = (lhc >> 4) & 0x3FFFF; 182 litCSize = (lhc >> 22) + ((size_t)istart[4] << 10); 183 break; 184 } 185 RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); 186 RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); 187 if (!singleStream) 188 RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong, 189 "Not enough literals (%zu) for the 4-streams mode (min %u)", 190 litSize, MIN_LITERALS_FOR_4_STREAMS); 191 RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, ""); 192 RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, ""); 193 ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0); 194 195 /* prefetch huffman table if cold */ 196 if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) { 197 PREFETCH_AREA(dctx->HUFptr, sizeof(dctx->entropy.hufTable)); 198 } 199 200 if (litEncType==set_repeat) { 201 if (singleStream) { 202 hufSuccess = HUF_decompress1X_usingDTable( 203 dctx->litBuffer, litSize, istart+lhSize, litCSize, 204 dctx->HUFptr, flags); 205 } else { 206 assert(litSize >= MIN_LITERALS_FOR_4_STREAMS); 207 hufSuccess = HUF_decompress4X_usingDTable( 208 dctx->litBuffer, litSize, istart+lhSize, litCSize, 209 dctx->HUFptr, flags); 210 } 211 } else { 212 if (singleStream) { 213 #if defined(HUF_FORCE_DECOMPRESS_X2) 214 hufSuccess 
= HUF_decompress1X_DCtx_wksp( 215 dctx->entropy.hufTable, dctx->litBuffer, litSize, 216 istart+lhSize, litCSize, dctx->workspace, 217 sizeof(dctx->workspace), flags); 218 #else 219 hufSuccess = HUF_decompress1X1_DCtx_wksp( 220 dctx->entropy.hufTable, dctx->litBuffer, litSize, 221 istart+lhSize, litCSize, dctx->workspace, 222 sizeof(dctx->workspace), flags); 223 #endif 224 } else { 225 hufSuccess = HUF_decompress4X_hufOnly_wksp( 226 dctx->entropy.hufTable, dctx->litBuffer, litSize, 227 istart+lhSize, litCSize, dctx->workspace, 228 sizeof(dctx->workspace), flags); 229 } 230 } 231 if (dctx->litBufferLocation == ZSTD_split) 232 { 233 assert(litSize > ZSTD_LITBUFFEREXTRASIZE); 234 ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE); 235 ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE); 236 dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; 237 dctx->litBufferEnd -= WILDCOPY_OVERLENGTH; 238 assert(dctx->litBufferEnd <= (BYTE*)dst + blockSizeMax); 239 } 240 241 RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, ""); 242 243 dctx->litPtr = dctx->litBuffer; 244 dctx->litSize = litSize; 245 dctx->litEntropy = 1; 246 if (litEncType==set_compressed) dctx->HUFptr = dctx->entropy.hufTable; 247 return litCSize + lhSize; 248 } 249 250 case set_basic: 251 { size_t litSize, lhSize; 252 U32 const lhlCode = ((istart[0]) >> 2) & 3; 253 size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); 254 switch(lhlCode) 255 { 256 case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ 257 lhSize = 1; 258 litSize = istart[0] >> 3; 259 break; 260 case 1: 261 lhSize = 2; 262 litSize = MEM_readLE16(istart) >> 4; 263 break; 264 case 3: 265 lhSize = 3; 266 RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize = 3"); 267 litSize = MEM_readLE24(istart) >> 4; 268 break; 269 } 270 271 RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); 272 RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); 273 RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); 274 ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); 275 if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */ 276 RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected, ""); 277 if (dctx->litBufferLocation == ZSTD_split) 278 { 279 ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize - ZSTD_LITBUFFEREXTRASIZE); 280 ZSTD_memcpy(dctx->litExtraBuffer, istart + lhSize + litSize - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE); 281 } 282 else 283 { 284 ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize); 285 } 286 dctx->litPtr = dctx->litBuffer; 287 dctx->litSize = litSize; 288 return lhSize+litSize; 289 } 290 /* direct reference into compressed stream */ 291 dctx->litPtr = istart+lhSize; 292 dctx->litSize = litSize; 293 dctx->litBufferEnd = dctx->litPtr + litSize; 294 dctx->litBufferLocation = ZSTD_not_in_dst; 295 return lhSize+litSize; 296 } 297 298 case set_rle: 299 { U32 const lhlCode = ((istart[0]) >> 2) & 3; 300 size_t litSize, lhSize; 301 size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); 302 switch(lhlCode) 303 { 304 case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ 305 lhSize = 1; 306 litSize = istart[0] >> 3; 307 break; 308 case 1: 
309 lhSize = 2; 310 RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 3"); 311 litSize = MEM_readLE16(istart) >> 4; 312 break; 313 case 3: 314 lhSize = 3; 315 RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 4"); 316 litSize = MEM_readLE24(istart) >> 4; 317 break; 318 } 319 RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); 320 RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); 321 RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); 322 ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); 323 if (dctx->litBufferLocation == ZSTD_split) 324 { 325 ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize - ZSTD_LITBUFFEREXTRASIZE); 326 ZSTD_memset(dctx->litExtraBuffer, istart[lhSize], ZSTD_LITBUFFEREXTRASIZE); 327 } 328 else 329 { 330 ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize); 331 } 332 dctx->litPtr = dctx->litBuffer; 333 dctx->litSize = litSize; 334 return lhSize+1; 335 } 336 default: 337 RETURN_ERROR(corruption_detected, "impossible"); 338 } 339 } 340 } 341 342 /* Hidden declaration for fullbench */ 343 size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx, 344 const void* src, size_t srcSize, 345 void* dst, size_t dstCapacity); 346 size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx, 347 const void* src, size_t srcSize, 348 void* dst, size_t dstCapacity) 349 { 350 dctx->isFrameDecompression = 0; 351 return ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, not_streaming); 352 } 353 354 /* Default FSE distribution tables. 355 * These are pre-calculated FSE decoding tables using default distributions as defined in specification : 356 * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions 357 * They were generated programmatically with following method : 358 * - start from default distributions, present in /lib/common/zstd_internal.h 359 * - generate tables normally, using ZSTD_buildFSETable() 360 * - printout the content of tables 361 * - pretify output, report below, test with fuzzer to ensure it's correct */ 362 363 /* Default FSE distribution table for Literal Lengths */ 364 static const ZSTD_seqSymbol LL_defaultDTable[(1<<LL_DEFAULTNORMLOG)+1] = { 365 { 1, 1, 1, LL_DEFAULTNORMLOG}, /* header : fastMode, tableLog */ 366 /* nextState, nbAddBits, nbBits, baseVal */ 367 { 0, 0, 4, 0}, { 16, 0, 4, 0}, 368 { 32, 0, 5, 1}, { 0, 0, 5, 3}, 369 { 0, 0, 5, 4}, { 0, 0, 5, 6}, 370 { 0, 0, 5, 7}, { 0, 0, 5, 9}, 371 { 0, 0, 5, 10}, { 0, 0, 5, 12}, 372 { 0, 0, 6, 14}, { 0, 1, 5, 16}, 373 { 0, 1, 5, 20}, { 0, 1, 5, 22}, 374 { 0, 2, 5, 28}, { 0, 3, 5, 32}, 375 { 0, 4, 5, 48}, { 32, 6, 5, 64}, 376 { 0, 7, 5, 128}, { 0, 8, 6, 256}, 377 { 0, 10, 6, 1024}, { 0, 12, 6, 4096}, 378 { 32, 0, 4, 0}, { 0, 0, 4, 1}, 379 { 0, 0, 5, 2}, { 32, 0, 5, 4}, 380 { 0, 0, 5, 5}, { 32, 0, 5, 7}, 381 { 0, 0, 5, 8}, { 32, 0, 5, 10}, 382 { 0, 0, 5, 11}, { 0, 0, 6, 13}, 383 { 32, 1, 5, 16}, { 0, 1, 5, 18}, 384 { 32, 1, 5, 22}, { 0, 2, 5, 24}, 385 { 32, 3, 5, 32}, { 0, 3, 5, 40}, 386 { 0, 6, 4, 64}, { 16, 6, 4, 64}, 387 { 32, 7, 5, 128}, { 0, 9, 6, 512}, 388 { 0, 11, 6, 2048}, { 48, 0, 4, 0}, 389 { 16, 0, 4, 1}, { 32, 0, 5, 2}, 390 { 32, 0, 5, 3}, { 32, 0, 5, 5}, 391 { 32, 0, 5, 6}, { 32, 0, 5, 8}, 392 { 32, 0, 5, 9}, { 32, 0, 5, 11}, 393 { 32, 0, 5, 12}, { 0, 0, 6, 15}, 394 { 32, 1, 5, 18}, { 32, 1, 5, 20}, 395 { 32, 2, 5, 24}, { 32, 2, 5, 28}, 
396 { 32, 3, 5, 40}, { 32, 4, 5, 48}, 397 { 0, 16, 6,65536}, { 0, 15, 6,32768}, 398 { 0, 14, 6,16384}, { 0, 13, 6, 8192}, 399 }; /* LL_defaultDTable */ 400 401 /* Default FSE distribution table for Offset Codes */ 402 static const ZSTD_seqSymbol OF_defaultDTable[(1<<OF_DEFAULTNORMLOG)+1] = { 403 { 1, 1, 1, OF_DEFAULTNORMLOG}, /* header : fastMode, tableLog */ 404 /* nextState, nbAddBits, nbBits, baseVal */ 405 { 0, 0, 5, 0}, { 0, 6, 4, 61}, 406 { 0, 9, 5, 509}, { 0, 15, 5,32765}, 407 { 0, 21, 5,2097149}, { 0, 3, 5, 5}, 408 { 0, 7, 4, 125}, { 0, 12, 5, 4093}, 409 { 0, 18, 5,262141}, { 0, 23, 5,8388605}, 410 { 0, 5, 5, 29}, { 0, 8, 4, 253}, 411 { 0, 14, 5,16381}, { 0, 20, 5,1048573}, 412 { 0, 2, 5, 1}, { 16, 7, 4, 125}, 413 { 0, 11, 5, 2045}, { 0, 17, 5,131069}, 414 { 0, 22, 5,4194301}, { 0, 4, 5, 13}, 415 { 16, 8, 4, 253}, { 0, 13, 5, 8189}, 416 { 0, 19, 5,524285}, { 0, 1, 5, 1}, 417 { 16, 6, 4, 61}, { 0, 10, 5, 1021}, 418 { 0, 16, 5,65533}, { 0, 28, 5,268435453}, 419 { 0, 27, 5,134217725}, { 0, 26, 5,67108861}, 420 { 0, 25, 5,33554429}, { 0, 24, 5,16777213}, 421 }; /* OF_defaultDTable */ 422 423 424 /* Default FSE distribution table for Match Lengths */ 425 static const ZSTD_seqSymbol ML_defaultDTable[(1<<ML_DEFAULTNORMLOG)+1] = { 426 { 1, 1, 1, ML_DEFAULTNORMLOG}, /* header : fastMode, tableLog */ 427 /* nextState, nbAddBits, nbBits, baseVal */ 428 { 0, 0, 6, 3}, { 0, 0, 4, 4}, 429 { 32, 0, 5, 5}, { 0, 0, 5, 6}, 430 { 0, 0, 5, 8}, { 0, 0, 5, 9}, 431 { 0, 0, 5, 11}, { 0, 0, 6, 13}, 432 { 0, 0, 6, 16}, { 0, 0, 6, 19}, 433 { 0, 0, 6, 22}, { 0, 0, 6, 25}, 434 { 0, 0, 6, 28}, { 0, 0, 6, 31}, 435 { 0, 0, 6, 34}, { 0, 1, 6, 37}, 436 { 0, 1, 6, 41}, { 0, 2, 6, 47}, 437 { 0, 3, 6, 59}, { 0, 4, 6, 83}, 438 { 0, 7, 6, 131}, { 0, 9, 6, 515}, 439 { 16, 0, 4, 4}, { 0, 0, 4, 5}, 440 { 32, 0, 5, 6}, { 0, 0, 5, 7}, 441 { 32, 0, 5, 9}, { 0, 0, 5, 10}, 442 { 0, 0, 6, 12}, { 0, 0, 6, 15}, 443 { 0, 0, 6, 18}, { 0, 0, 6, 21}, 444 { 0, 0, 6, 24}, { 0, 0, 6, 27}, 445 { 0, 0, 6, 30}, { 0, 0, 6, 33}, 446 { 0, 1, 6, 35}, { 0, 1, 6, 39}, 447 { 0, 2, 6, 43}, { 0, 3, 6, 51}, 448 { 0, 4, 6, 67}, { 0, 5, 6, 99}, 449 { 0, 8, 6, 259}, { 32, 0, 4, 4}, 450 { 48, 0, 4, 4}, { 16, 0, 4, 5}, 451 { 32, 0, 5, 7}, { 32, 0, 5, 8}, 452 { 32, 0, 5, 10}, { 32, 0, 5, 11}, 453 { 0, 0, 6, 14}, { 0, 0, 6, 17}, 454 { 0, 0, 6, 20}, { 0, 0, 6, 23}, 455 { 0, 0, 6, 26}, { 0, 0, 6, 29}, 456 { 0, 0, 6, 32}, { 0, 16, 6,65539}, 457 { 0, 15, 6,32771}, { 0, 14, 6,16387}, 458 { 0, 13, 6, 8195}, { 0, 12, 6, 4099}, 459 { 0, 11, 6, 2051}, { 0, 10, 6, 1027}, 460 }; /* ML_defaultDTable */ 461 462 463 static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U8 nbAddBits) 464 { 465 void* ptr = dt; 466 ZSTD_seqSymbol_header* const DTableH = (ZSTD_seqSymbol_header*)ptr; 467 ZSTD_seqSymbol* const cell = dt + 1; 468 469 DTableH->tableLog = 0; 470 DTableH->fastMode = 0; 471 472 cell->nbBits = 0; 473 cell->nextState = 0; 474 assert(nbAddBits < 255); 475 cell->nbAdditionalBits = nbAddBits; 476 cell->baseValue = baseValue; 477 } 478 479 480 /* ZSTD_buildFSETable() : 481 * generate FSE decoding table for one symbol (ll, ml or off) 482 * cannot fail if input is valid => 483 * all inputs are presumed validated at this stage */ 484 FORCE_INLINE_TEMPLATE 485 void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, 486 const short* normalizedCounter, unsigned maxSymbolValue, 487 const U32* baseValue, const U8* nbAdditionalBits, 488 unsigned tableLog, void* wksp, size_t wkspSize) 489 { 490 ZSTD_seqSymbol* const tableDecode = dt+1; 491 U32 const maxSV1 = 
maxSymbolValue + 1; 492 U32 const tableSize = 1 << tableLog; 493 494 U16* symbolNext = (U16*)wksp; 495 BYTE* spread = (BYTE*)(symbolNext + MaxSeq + 1); 496 U32 highThreshold = tableSize - 1; 497 498 499 /* Sanity Checks */ 500 assert(maxSymbolValue <= MaxSeq); 501 assert(tableLog <= MaxFSELog); 502 assert(wkspSize >= ZSTD_BUILD_FSE_TABLE_WKSP_SIZE); 503 (void)wkspSize; 504 /* Init, lay down lowprob symbols */ 505 { ZSTD_seqSymbol_header DTableH; 506 DTableH.tableLog = tableLog; 507 DTableH.fastMode = 1; 508 { S16 const largeLimit= (S16)(1 << (tableLog-1)); 509 U32 s; 510 for (s=0; s<maxSV1; s++) { 511 if (normalizedCounter[s]==-1) { 512 tableDecode[highThreshold--].baseValue = s; 513 symbolNext[s] = 1; 514 } else { 515 if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0; 516 assert(normalizedCounter[s]>=0); 517 symbolNext[s] = (U16)normalizedCounter[s]; 518 } } } 519 ZSTD_memcpy(dt, &DTableH, sizeof(DTableH)); 520 } 521 522 /* Spread symbols */ 523 assert(tableSize <= 512); 524 /* Specialized symbol spreading for the case when there are 525 * no low probability (-1 count) symbols. When compressing 526 * small blocks we avoid low probability symbols to hit this 527 * case, since header decoding speed matters more. 528 */ 529 if (highThreshold == tableSize - 1) { 530 size_t const tableMask = tableSize-1; 531 size_t const step = FSE_TABLESTEP(tableSize); 532 /* First lay down the symbols in order. 533 * We use a uint64_t to lay down 8 bytes at a time. This reduces branch 534 * misses since small blocks generally have small table logs, so nearly 535 * all symbols have counts <= 8. We ensure we have 8 bytes at the end of 536 * our buffer to handle the over-write. 537 */ 538 { 539 U64 const add = 0x0101010101010101ull; 540 size_t pos = 0; 541 U64 sv = 0; 542 U32 s; 543 for (s=0; s<maxSV1; ++s, sv += add) { 544 int i; 545 int const n = normalizedCounter[s]; 546 MEM_write64(spread + pos, sv); 547 for (i = 8; i < n; i += 8) { 548 MEM_write64(spread + pos + i, sv); 549 } 550 assert(n>=0); 551 pos += (size_t)n; 552 } 553 } 554 /* Now we spread those positions across the table. 555 * The benefit of doing it in two stages is that we avoid the 556 * variable size inner loop, which caused lots of branch misses. 557 * Now we can run through all the positions without any branch misses. 558 * We unroll the loop twice, since that is what empirically worked best. 
559 */ 560 { 561 size_t position = 0; 562 size_t s; 563 size_t const unroll = 2; 564 assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */ 565 for (s = 0; s < (size_t)tableSize; s += unroll) { 566 size_t u; 567 for (u = 0; u < unroll; ++u) { 568 size_t const uPosition = (position + (u * step)) & tableMask; 569 tableDecode[uPosition].baseValue = spread[s + u]; 570 } 571 position = (position + (unroll * step)) & tableMask; 572 } 573 assert(position == 0); 574 } 575 } else { 576 U32 const tableMask = tableSize-1; 577 U32 const step = FSE_TABLESTEP(tableSize); 578 U32 s, position = 0; 579 for (s=0; s<maxSV1; s++) { 580 int i; 581 int const n = normalizedCounter[s]; 582 for (i=0; i<n; i++) { 583 tableDecode[position].baseValue = s; 584 position = (position + step) & tableMask; 585 while (UNLIKELY(position > highThreshold)) position = (position + step) & tableMask; /* lowprob area */ 586 } } 587 assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */ 588 } 589 590 /* Build Decoding table */ 591 { 592 U32 u; 593 for (u=0; u<tableSize; u++) { 594 U32 const symbol = tableDecode[u].baseValue; 595 U32 const nextState = symbolNext[symbol]++; 596 tableDecode[u].nbBits = (BYTE) (tableLog - ZSTD_highbit32(nextState) ); 597 tableDecode[u].nextState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize); 598 assert(nbAdditionalBits[symbol] < 255); 599 tableDecode[u].nbAdditionalBits = nbAdditionalBits[symbol]; 600 tableDecode[u].baseValue = baseValue[symbol]; 601 } 602 } 603 } 604 605 /* Avoids the FORCE_INLINE of the _body() function. */ 606 static void ZSTD_buildFSETable_body_default(ZSTD_seqSymbol* dt, 607 const short* normalizedCounter, unsigned maxSymbolValue, 608 const U32* baseValue, const U8* nbAdditionalBits, 609 unsigned tableLog, void* wksp, size_t wkspSize) 610 { 611 ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue, 612 baseValue, nbAdditionalBits, tableLog, wksp, wkspSize); 613 } 614 615 #if DYNAMIC_BMI2 616 BMI2_TARGET_ATTRIBUTE static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol* dt, 617 const short* normalizedCounter, unsigned maxSymbolValue, 618 const U32* baseValue, const U8* nbAdditionalBits, 619 unsigned tableLog, void* wksp, size_t wkspSize) 620 { 621 ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue, 622 baseValue, nbAdditionalBits, tableLog, wksp, wkspSize); 623 } 624 #endif 625 626 void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, 627 const short* normalizedCounter, unsigned maxSymbolValue, 628 const U32* baseValue, const U8* nbAdditionalBits, 629 unsigned tableLog, void* wksp, size_t wkspSize, int bmi2) 630 { 631 #if DYNAMIC_BMI2 632 if (bmi2) { 633 ZSTD_buildFSETable_body_bmi2(dt, normalizedCounter, maxSymbolValue, 634 baseValue, nbAdditionalBits, tableLog, wksp, wkspSize); 635 return; 636 } 637 #endif 638 (void)bmi2; 639 ZSTD_buildFSETable_body_default(dt, normalizedCounter, maxSymbolValue, 640 baseValue, nbAdditionalBits, tableLog, wksp, wkspSize); 641 } 642 643 644 /*! 
ZSTD_buildSeqTable() : 645 * @return : nb bytes read from src, 646 * or an error code if it fails */ 647 static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymbol** DTablePtr, 648 symbolEncodingType_e type, unsigned max, U32 maxLog, 649 const void* src, size_t srcSize, 650 const U32* baseValue, const U8* nbAdditionalBits, 651 const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable, 652 int ddictIsCold, int nbSeq, U32* wksp, size_t wkspSize, 653 int bmi2) 654 { 655 switch(type) 656 { 657 case set_rle : 658 RETURN_ERROR_IF(!srcSize, srcSize_wrong, ""); 659 RETURN_ERROR_IF((*(const BYTE*)src) > max, corruption_detected, ""); 660 { U32 const symbol = *(const BYTE*)src; 661 U32 const baseline = baseValue[symbol]; 662 U8 const nbBits = nbAdditionalBits[symbol]; 663 ZSTD_buildSeqTable_rle(DTableSpace, baseline, nbBits); 664 } 665 *DTablePtr = DTableSpace; 666 return 1; 667 case set_basic : 668 *DTablePtr = defaultTable; 669 return 0; 670 case set_repeat: 671 RETURN_ERROR_IF(!flagRepeatTable, corruption_detected, ""); 672 /* prefetch FSE table if used */ 673 if (ddictIsCold && (nbSeq > 24 /* heuristic */)) { 674 const void* const pStart = *DTablePtr; 675 size_t const pSize = sizeof(ZSTD_seqSymbol) * (SEQSYMBOL_TABLE_SIZE(maxLog)); 676 PREFETCH_AREA(pStart, pSize); 677 } 678 return 0; 679 case set_compressed : 680 { unsigned tableLog; 681 S16 norm[MaxSeq+1]; 682 size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize); 683 RETURN_ERROR_IF(FSE_isError(headerSize), corruption_detected, ""); 684 RETURN_ERROR_IF(tableLog > maxLog, corruption_detected, ""); 685 ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog, wksp, wkspSize, bmi2); 686 *DTablePtr = DTableSpace; 687 return headerSize; 688 } 689 default : 690 assert(0); 691 RETURN_ERROR(GENERIC, "impossible"); 692 } 693 } 694 695 size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, 696 const void* src, size_t srcSize) 697 { 698 const BYTE* const istart = (const BYTE*)src; 699 const BYTE* const iend = istart + srcSize; 700 const BYTE* ip = istart; 701 int nbSeq; 702 DEBUGLOG(5, "ZSTD_decodeSeqHeaders"); 703 704 /* check */ 705 RETURN_ERROR_IF(srcSize < MIN_SEQUENCES_SIZE, srcSize_wrong, ""); 706 707 /* SeqHead */ 708 nbSeq = *ip++; 709 if (nbSeq > 0x7F) { 710 if (nbSeq == 0xFF) { 711 RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, ""); 712 nbSeq = MEM_readLE16(ip) + LONGNBSEQ; 713 ip+=2; 714 } else { 715 RETURN_ERROR_IF(ip >= iend, srcSize_wrong, ""); 716 nbSeq = ((nbSeq-0x80)<<8) + *ip++; 717 } 718 } 719 *nbSeqPtr = nbSeq; 720 721 if (nbSeq == 0) { 722 /* No sequence : section ends immediately */ 723 RETURN_ERROR_IF(ip != iend, corruption_detected, 724 "extraneous data present in the Sequences section"); 725 return (size_t)(ip - istart); 726 } 727 728 /* FSE table descriptors */ 729 RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */ 730 RETURN_ERROR_IF(*ip & 3, corruption_detected, ""); /* The last field, Reserved, must be all-zeroes. 
 */
    {   symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6);
        symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3);
        symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3);
        ip++;

        /* Build DTables */
        {   size_t const llhSize = ZSTD_buildSeqTable(dctx->entropy.LLTable, &dctx->LLTptr,
                                                      LLtype, MaxLL, LLFSELog,
                                                      ip, iend-ip,
                                                      LL_base, LL_bits,
                                                      LL_defaultDTable, dctx->fseEntropy,
                                                      dctx->ddictIsCold, nbSeq,
                                                      dctx->workspace, sizeof(dctx->workspace),
                                                      ZSTD_DCtx_get_bmi2(dctx));
            RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "ZSTD_buildSeqTable failed");
            ip += llhSize;
        }

        {   size_t const ofhSize = ZSTD_buildSeqTable(dctx->entropy.OFTable, &dctx->OFTptr,
                                                      OFtype, MaxOff, OffFSELog,
                                                      ip, iend-ip,
                                                      OF_base, OF_bits,
                                                      OF_defaultDTable, dctx->fseEntropy,
                                                      dctx->ddictIsCold, nbSeq,
                                                      dctx->workspace, sizeof(dctx->workspace),
                                                      ZSTD_DCtx_get_bmi2(dctx));
            RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "ZSTD_buildSeqTable failed");
            ip += ofhSize;
        }

        {   size_t const mlhSize = ZSTD_buildSeqTable(dctx->entropy.MLTable, &dctx->MLTptr,
                                                      MLtype, MaxML, MLFSELog,
                                                      ip, iend-ip,
                                                      ML_base, ML_bits,
                                                      ML_defaultDTable, dctx->fseEntropy,
                                                      dctx->ddictIsCold, nbSeq,
                                                      dctx->workspace, sizeof(dctx->workspace),
                                                      ZSTD_DCtx_get_bmi2(dctx));
            RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "ZSTD_buildSeqTable failed");
            ip += mlhSize;
        }
    }

    return ip-istart;
}
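/* Illustration : how the Symbol_Compression_Modes byte parsed above breaks down,
 * assuming the usual symbolEncodingType_e ordering
 * (set_basic=0, set_rle=1, set_compressed=2, set_repeat=3).
 * For a hypothetical descriptor byte 0xB4 == 0b10110100 :
 *     LLtype   =  0xB4 >> 6       == 2   (set_compressed : an FSE table description follows)
 *     OFtype   = (0xB4 >> 4) & 3  == 3   (set_repeat : reuse the previous table)
 *     MLtype   = (0xB4 >> 2) & 3  == 1   (set_rle : a single symbol byte follows)
 *     Reserved =  0xB4 & 3        == 0   (must be zero, otherwise corruption_detected)
 */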
typedef struct {
    size_t litLength;
    size_t matchLength;
    size_t offset;
} seq_t;

typedef struct {
    size_t state;
    const ZSTD_seqSymbol* table;
} ZSTD_fseState;

typedef struct {
    BIT_DStream_t DStream;
    ZSTD_fseState stateLL;
    ZSTD_fseState stateOffb;
    ZSTD_fseState stateML;
    size_t prevOffset[ZSTD_REP_NUM];
} seqState_t;

/*! ZSTD_overlapCopy8() :
 *  Copies 8 bytes from ip to op and updates op and ip where ip <= op.
 *  If the offset is < 8 then the offset is spread to at least 8 bytes.
 *
 *  Precondition: *ip <= *op
 *  Postcondition: *op - *ip >= 8
 */
HINT_INLINE void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) {
    assert(*ip <= *op);
    if (offset < 8) {
        /* close range match, overlap */
        static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 };   /* added */
        static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 };   /* subtracted */
        int const sub2 = dec64table[offset];
        (*op)[0] = (*ip)[0];
        (*op)[1] = (*ip)[1];
        (*op)[2] = (*ip)[2];
        (*op)[3] = (*ip)[3];
        *ip += dec32table[offset];
        ZSTD_copy4(*op+4, *ip);
        *ip -= sub2;
    } else {
        ZSTD_copy8(*op, *ip);
    }
    *ip += 8;
    *op += 8;
    assert(*op - *ip >= 8);
}
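/* Illustration : a sketch of ZSTD_overlapCopy8() on a short offset, assuming
 * offset==2 and the two bytes "AB" sitting just before *op (i.e. *ip == *op - 2) :
 *   - the four byte-by-byte copies write "ABAB" into (*op)[0..3]
 *   - *ip += dec32table[2] (== 2), so *ip now points at (*op)[0]
 *   - ZSTD_copy4(*op+4, *ip) duplicates "ABAB" into (*op)[4..7]
 *   - *ip -= dec64table[2] (== 8), then the shared tail does *ip += 8; *op += 8;
 * leaving *op - *ip == 8, so subsequent 8-byte copies no longer overlap.
 */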
/*! ZSTD_safecopy() :
 *  Specialized version of memcpy() that is allowed to READ up to WILDCOPY_OVERLENGTH past the input buffer
 *  and write up to 16 bytes past oend_w (op >= oend_w is allowed).
 *  This function is only called in the uncommon case where the sequence is near the end of the block. It
 *  should be fast for a single long sequence, but can be slow for several short sequences.
 *
 *  @param ovtype controls the overlap detection
 *         - ZSTD_no_overlap: The source and destination are guaranteed to be at least WILDCOPY_VECLEN bytes apart.
 *         - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart.
 *           The src buffer must be before the dst buffer.
 */
static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) {
    ptrdiff_t const diff = op - ip;
    BYTE* const oend = op + length;

    assert((ovtype == ZSTD_no_overlap && (diff <= -8 || diff >= 8 || op >= oend_w)) ||
           (ovtype == ZSTD_overlap_src_before_dst && diff >= 0));

    if (length < 8) {
        /* Handle short lengths. */
        while (op < oend) *op++ = *ip++;
        return;
    }
    if (ovtype == ZSTD_overlap_src_before_dst) {
        /* Copy 8 bytes and ensure the offset >= 8 when there can be overlap. */
        assert(length >= 8);
        ZSTD_overlapCopy8(&op, &ip, diff);
        length -= 8;
        assert(op - ip >= 8);
        assert(op <= oend);
    }

    if (oend <= oend_w) {
        /* No risk of overwrite. */
        ZSTD_wildcopy(op, ip, length, ovtype);
        return;
    }
    if (op <= oend_w) {
        /* Wildcopy until we get close to the end. */
        assert(oend > oend_w);
        ZSTD_wildcopy(op, ip, oend_w - op, ovtype);
        ip += oend_w - op;
        op += oend_w - op;
    }
    /* Handle the leftovers. */
    while (op < oend) *op++ = *ip++;
}

/* ZSTD_safecopyDstBeforeSrc():
 * This version allows overlap with dst before src, or handles the non-overlap case with dst after src
 * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */
static void ZSTD_safecopyDstBeforeSrc(BYTE* op, const BYTE* ip, ptrdiff_t length) {
    ptrdiff_t const diff = op - ip;
    BYTE* const oend = op + length;

    if (length < 8 || diff > -8) {
        /* Handle short lengths, close overlaps, and dst not before src. */
        while (op < oend) *op++ = *ip++;
        return;
    }

    if (op <= oend - WILDCOPY_OVERLENGTH && diff < -WILDCOPY_VECLEN) {
        ZSTD_wildcopy(op, ip, oend - WILDCOPY_OVERLENGTH - op, ZSTD_no_overlap);
        ip += oend - WILDCOPY_OVERLENGTH - op;
        op += oend - WILDCOPY_OVERLENGTH - op;
    }

    /* Handle the leftovers. */
    while (op < oend) *op++ = *ip++;
}

/* ZSTD_execSequenceEnd():
 * This version handles cases that are near the end of the output buffer. It requires
 * more careful checks to make sure there is no overflow. By separating out these hard
 * and unlikely cases, we can speed up the common cases.
 *
 * NOTE: This function needs to be fast for a single long sequence, but doesn't need
 * to be optimized for many small sequences, since those fall into ZSTD_execSequence().
 */
FORCE_NOINLINE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_execSequenceEnd(BYTE* op,
    BYTE* const oend, seq_t sequence,
    const BYTE** litPtr, const BYTE* const litLimit,
    const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
{
    BYTE* const oLitEnd = op + sequence.litLength;
    size_t const sequenceLength = sequence.litLength + sequence.matchLength;
    const BYTE* const iLitEnd = *litPtr + sequence.litLength;
    const BYTE* match = oLitEnd - sequence.offset;
    BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;

    /* bounds checks : careful of address space overflow in 32-bit mode */
    RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer");
    RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer");
    assert(op < op + sequenceLength);
    assert(oLitEnd < op + sequenceLength);

    /* copy literals */
    ZSTD_safecopy(op, oend_w, *litPtr, sequence.litLength, ZSTD_no_overlap);
    op = oLitEnd;
    *litPtr = iLitEnd;

    /* copy Match */
    if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
        /* offset beyond prefix */
        RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, "");
        match = dictEnd - (prefixStart - match);
        if (match + sequence.matchLength <= dictEnd) {
            ZSTD_memmove(oLitEnd, match, sequence.matchLength);
            return sequenceLength;
        }
        /* span extDict & currentPrefixSegment */
        {   size_t const length1 = dictEnd - match;
            ZSTD_memmove(oLitEnd, match, length1);
            op = oLitEnd + length1;
            sequence.matchLength -= length1;
            match = prefixStart;
        }
    }
    ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);
    return sequenceLength;
}

/* ZSTD_execSequenceEndSplitLitBuffer():
 * This version is intended to be used during instances where the litBuffer is still split. It is kept separate to avoid performance impact for the good case.
952 */ 953 FORCE_NOINLINE 954 ZSTD_ALLOW_POINTER_OVERFLOW_ATTR 955 size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op, 956 BYTE* const oend, const BYTE* const oend_w, seq_t sequence, 957 const BYTE** litPtr, const BYTE* const litLimit, 958 const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) 959 { 960 BYTE* const oLitEnd = op + sequence.litLength; 961 size_t const sequenceLength = sequence.litLength + sequence.matchLength; 962 const BYTE* const iLitEnd = *litPtr + sequence.litLength; 963 const BYTE* match = oLitEnd - sequence.offset; 964 965 966 /* bounds checks : careful of address space overflow in 32-bit mode */ 967 RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer"); 968 RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer"); 969 assert(op < op + sequenceLength); 970 assert(oLitEnd < op + sequenceLength); 971 972 /* copy literals */ 973 RETURN_ERROR_IF(op > *litPtr && op < *litPtr + sequence.litLength, dstSize_tooSmall, "output should not catch up to and overwrite literal buffer"); 974 ZSTD_safecopyDstBeforeSrc(op, *litPtr, sequence.litLength); 975 op = oLitEnd; 976 *litPtr = iLitEnd; 977 978 /* copy Match */ 979 if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { 980 /* offset beyond prefix */ 981 RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, ""); 982 match = dictEnd - (prefixStart - match); 983 if (match + sequence.matchLength <= dictEnd) { 984 ZSTD_memmove(oLitEnd, match, sequence.matchLength); 985 return sequenceLength; 986 } 987 /* span extDict & currentPrefixSegment */ 988 { size_t const length1 = dictEnd - match; 989 ZSTD_memmove(oLitEnd, match, length1); 990 op = oLitEnd + length1; 991 sequence.matchLength -= length1; 992 match = prefixStart; 993 } 994 } 995 ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst); 996 return sequenceLength; 997 } 998 999 HINT_INLINE 1000 ZSTD_ALLOW_POINTER_OVERFLOW_ATTR 1001 size_t ZSTD_execSequence(BYTE* op, 1002 BYTE* const oend, seq_t sequence, 1003 const BYTE** litPtr, const BYTE* const litLimit, 1004 const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) 1005 { 1006 BYTE* const oLitEnd = op + sequence.litLength; 1007 size_t const sequenceLength = sequence.litLength + sequence.matchLength; 1008 BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */ 1009 BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH; /* risk : address space underflow on oend=NULL */ 1010 const BYTE* const iLitEnd = *litPtr + sequence.litLength; 1011 const BYTE* match = oLitEnd - sequence.offset; 1012 1013 assert(op != NULL /* Precondition */); 1014 assert(oend_w < oend /* No underflow */); 1015 1016 #if defined(__aarch64__) 1017 /* prefetch sequence starting from match that will be used for copy later */ 1018 PREFETCH_L1(match); 1019 #endif 1020 /* Handle edge cases in a slow path: 1021 * - Read beyond end of literals 1022 * - Match end is within WILDCOPY_OVERLIMIT of oend 1023 * - 32-bit mode and the match length overflows 1024 */ 1025 if (UNLIKELY( 1026 iLitEnd > litLimit || 1027 oMatchEnd > oend_w || 1028 (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH))) 1029 return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd); 1030 1031 /* Assumptions (everything else goes into 
ZSTD_execSequenceEnd()) */ 1032 assert(op <= oLitEnd /* No overflow */); 1033 assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */); 1034 assert(oMatchEnd <= oend /* No underflow */); 1035 assert(iLitEnd <= litLimit /* Literal length is in bounds */); 1036 assert(oLitEnd <= oend_w /* Can wildcopy literals */); 1037 assert(oMatchEnd <= oend_w /* Can wildcopy matches */); 1038 1039 /* Copy Literals: 1040 * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9. 1041 * We likely don't need the full 32-byte wildcopy. 1042 */ 1043 assert(WILDCOPY_OVERLENGTH >= 16); 1044 ZSTD_copy16(op, (*litPtr)); 1045 if (UNLIKELY(sequence.litLength > 16)) { 1046 ZSTD_wildcopy(op + 16, (*litPtr) + 16, sequence.litLength - 16, ZSTD_no_overlap); 1047 } 1048 op = oLitEnd; 1049 *litPtr = iLitEnd; /* update for next sequence */ 1050 1051 /* Copy Match */ 1052 if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { 1053 /* offset beyond prefix -> go into extDict */ 1054 RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, ""); 1055 match = dictEnd + (match - prefixStart); 1056 if (match + sequence.matchLength <= dictEnd) { 1057 ZSTD_memmove(oLitEnd, match, sequence.matchLength); 1058 return sequenceLength; 1059 } 1060 /* span extDict & currentPrefixSegment */ 1061 { size_t const length1 = dictEnd - match; 1062 ZSTD_memmove(oLitEnd, match, length1); 1063 op = oLitEnd + length1; 1064 sequence.matchLength -= length1; 1065 match = prefixStart; 1066 } 1067 } 1068 /* Match within prefix of 1 or more bytes */ 1069 assert(op <= oMatchEnd); 1070 assert(oMatchEnd <= oend_w); 1071 assert(match >= prefixStart); 1072 assert(sequence.matchLength >= 1); 1073 1074 /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy 1075 * without overlap checking. 1076 */ 1077 if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) { 1078 /* We bet on a full wildcopy for matches, since we expect matches to be 1079 * longer than literals (in general). In silesia, ~10% of matches are longer 1080 * than 16 bytes. 1081 */ 1082 ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap); 1083 return sequenceLength; 1084 } 1085 assert(sequence.offset < WILDCOPY_VECLEN); 1086 1087 /* Copy 8 bytes and spread the offset to be >= 8. */ 1088 ZSTD_overlapCopy8(&op, &match, sequence.offset); 1089 1090 /* If the match length is > 8 bytes, then continue with the wildcopy. 
*/ 1091 if (sequence.matchLength > 8) { 1092 assert(op < oMatchEnd); 1093 ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8, ZSTD_overlap_src_before_dst); 1094 } 1095 return sequenceLength; 1096 } 1097 1098 HINT_INLINE 1099 ZSTD_ALLOW_POINTER_OVERFLOW_ATTR 1100 size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op, 1101 BYTE* const oend, const BYTE* const oend_w, seq_t sequence, 1102 const BYTE** litPtr, const BYTE* const litLimit, 1103 const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) 1104 { 1105 BYTE* const oLitEnd = op + sequence.litLength; 1106 size_t const sequenceLength = sequence.litLength + sequence.matchLength; 1107 BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */ 1108 const BYTE* const iLitEnd = *litPtr + sequence.litLength; 1109 const BYTE* match = oLitEnd - sequence.offset; 1110 1111 assert(op != NULL /* Precondition */); 1112 assert(oend_w < oend /* No underflow */); 1113 /* Handle edge cases in a slow path: 1114 * - Read beyond end of literals 1115 * - Match end is within WILDCOPY_OVERLIMIT of oend 1116 * - 32-bit mode and the match length overflows 1117 */ 1118 if (UNLIKELY( 1119 iLitEnd > litLimit || 1120 oMatchEnd > oend_w || 1121 (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH))) 1122 return ZSTD_execSequenceEndSplitLitBuffer(op, oend, oend_w, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd); 1123 1124 /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */ 1125 assert(op <= oLitEnd /* No overflow */); 1126 assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */); 1127 assert(oMatchEnd <= oend /* No underflow */); 1128 assert(iLitEnd <= litLimit /* Literal length is in bounds */); 1129 assert(oLitEnd <= oend_w /* Can wildcopy literals */); 1130 assert(oMatchEnd <= oend_w /* Can wildcopy matches */); 1131 1132 /* Copy Literals: 1133 * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9. 1134 * We likely don't need the full 32-byte wildcopy. 1135 */ 1136 assert(WILDCOPY_OVERLENGTH >= 16); 1137 ZSTD_copy16(op, (*litPtr)); 1138 if (UNLIKELY(sequence.litLength > 16)) { 1139 ZSTD_wildcopy(op+16, (*litPtr)+16, sequence.litLength-16, ZSTD_no_overlap); 1140 } 1141 op = oLitEnd; 1142 *litPtr = iLitEnd; /* update for next sequence */ 1143 1144 /* Copy Match */ 1145 if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { 1146 /* offset beyond prefix -> go into extDict */ 1147 RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, ""); 1148 match = dictEnd + (match - prefixStart); 1149 if (match + sequence.matchLength <= dictEnd) { 1150 ZSTD_memmove(oLitEnd, match, sequence.matchLength); 1151 return sequenceLength; 1152 } 1153 /* span extDict & currentPrefixSegment */ 1154 { size_t const length1 = dictEnd - match; 1155 ZSTD_memmove(oLitEnd, match, length1); 1156 op = oLitEnd + length1; 1157 sequence.matchLength -= length1; 1158 match = prefixStart; 1159 } } 1160 /* Match within prefix of 1 or more bytes */ 1161 assert(op <= oMatchEnd); 1162 assert(oMatchEnd <= oend_w); 1163 assert(match >= prefixStart); 1164 assert(sequence.matchLength >= 1); 1165 1166 /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy 1167 * without overlap checking. 1168 */ 1169 if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) { 1170 /* We bet on a full wildcopy for matches, since we expect matches to be 1171 * longer than literals (in general). 
In silesia, ~10% of matches are longer 1172 * than 16 bytes. 1173 */ 1174 ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap); 1175 return sequenceLength; 1176 } 1177 assert(sequence.offset < WILDCOPY_VECLEN); 1178 1179 /* Copy 8 bytes and spread the offset to be >= 8. */ 1180 ZSTD_overlapCopy8(&op, &match, sequence.offset); 1181 1182 /* If the match length is > 8 bytes, then continue with the wildcopy. */ 1183 if (sequence.matchLength > 8) { 1184 assert(op < oMatchEnd); 1185 ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8, ZSTD_overlap_src_before_dst); 1186 } 1187 return sequenceLength; 1188 } 1189 1190 1191 static void 1192 ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqSymbol* dt) 1193 { 1194 const void* ptr = dt; 1195 const ZSTD_seqSymbol_header* const DTableH = (const ZSTD_seqSymbol_header*)ptr; 1196 DStatePtr->state = BIT_readBits(bitD, DTableH->tableLog); 1197 DEBUGLOG(6, "ZSTD_initFseState : val=%u using %u bits", 1198 (U32)DStatePtr->state, DTableH->tableLog); 1199 BIT_reloadDStream(bitD); 1200 DStatePtr->table = dt + 1; 1201 } 1202 1203 FORCE_INLINE_TEMPLATE void 1204 ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 nextState, U32 nbBits) 1205 { 1206 size_t const lowBits = BIT_readBits(bitD, nbBits); 1207 DStatePtr->state = nextState + lowBits; 1208 } 1209 1210 /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum 1211 * offset bits. But we can only read at most STREAM_ACCUMULATOR_MIN_32 1212 * bits before reloading. This value is the maximum number of bytes we read 1213 * after reloading when we are decoding long offsets. 1214 */ 1215 #define LONG_OFFSETS_MAX_EXTRA_BITS_32 \ 1216 (ZSTD_WINDOWLOG_MAX_32 > STREAM_ACCUMULATOR_MIN_32 \ 1217 ? ZSTD_WINDOWLOG_MAX_32 - STREAM_ACCUMULATOR_MIN_32 \ 1218 : 0) 1219 1220 typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e; 1221 1222 /** 1223 * ZSTD_decodeSequence(): 1224 * @p longOffsets : tells the decoder to reload more bit while decoding large offsets 1225 * only used in 32-bit mode 1226 * @return : Sequence (litL + matchL + offset) 1227 */ 1228 FORCE_INLINE_TEMPLATE seq_t 1229 ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const int isLastSeq) 1230 { 1231 seq_t seq; 1232 /* 1233 * ZSTD_seqSymbol is a 64 bits wide structure. 1234 * It can be loaded in one operation 1235 * and its fields extracted by simply shifting or bit-extracting on aarch64. 1236 * GCC doesn't recognize this and generates more unnecessary ldr/ldrb/ldrh 1237 * operations that cause performance drop. This can be avoided by using this 1238 * ZSTD_memcpy hack. 
 */
#if defined(__aarch64__) && (defined(__GNUC__) && !defined(__clang__))
    ZSTD_seqSymbol llDInfoS, mlDInfoS, ofDInfoS;
    ZSTD_seqSymbol* const llDInfo = &llDInfoS;
    ZSTD_seqSymbol* const mlDInfo = &mlDInfoS;
    ZSTD_seqSymbol* const ofDInfo = &ofDInfoS;
    ZSTD_memcpy(llDInfo, seqState->stateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol));
    ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol));
    ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol));
#else
    const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state;
    const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state;
    const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state;
#endif
    seq.matchLength = mlDInfo->baseValue;
    seq.litLength = llDInfo->baseValue;
    {   U32 const ofBase = ofDInfo->baseValue;
        BYTE const llBits = llDInfo->nbAdditionalBits;
        BYTE const mlBits = mlDInfo->nbAdditionalBits;
        BYTE const ofBits = ofDInfo->nbAdditionalBits;
        BYTE const totalBits = llBits+mlBits+ofBits;

        U16 const llNext = llDInfo->nextState;
        U16 const mlNext = mlDInfo->nextState;
        U16 const ofNext = ofDInfo->nextState;
        U32 const llnbBits = llDInfo->nbBits;
        U32 const mlnbBits = mlDInfo->nbBits;
        U32 const ofnbBits = ofDInfo->nbBits;

        assert(llBits <= MaxLLBits);
        assert(mlBits <= MaxMLBits);
        assert(ofBits <= MaxOff);
        /*
         * As gcc has better branch and block analyzers, sometimes it is only
         * valuable to mark likeliness for clang; doing so gives around 3-4% of
         * performance.
         */

        /* sequence */
        {   size_t offset;
            if (ofBits > 1) {
                ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
                ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
                ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32);
                ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits);
                if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
                    /* Always read extra bits; this keeps the logic simple,
                     * avoids branches, and avoids accidentally reading 0 bits.
                     */
                    U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32;
                    offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
                    BIT_reloadDStream(&seqState->DStream);
                    offset += BIT_readBitsFast(&seqState->DStream, extraBits);
                } else {
                    offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/);   /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
                    if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
                }
                seqState->prevOffset[2] = seqState->prevOffset[1];
                seqState->prevOffset[1] = seqState->prevOffset[0];
                seqState->prevOffset[0] = offset;
            } else {
                U32 const ll0 = (llDInfo->baseValue == 0);
                if (LIKELY((ofBits == 0))) {
                    offset = seqState->prevOffset[ll0];
                    seqState->prevOffset[1] = seqState->prevOffset[!ll0];
                    seqState->prevOffset[0] = offset;
                } else {
                    offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1);
                    {   size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
                        temp -= !temp;   /* 0 is not valid: input corrupted => force offset to -1 => corruption detected at execSequence */
                        if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
                        seqState->prevOffset[1] = seqState->prevOffset[0];
                        seqState->prevOffset[0] = offset = temp;
            }   }   }
            seq.offset = offset;
        }

        if (mlBits > 0)
            seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);

        if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
            BIT_reloadDStream(&seqState->DStream);
        if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
            BIT_reloadDStream(&seqState->DStream);
        /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
        ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);

        if (llBits > 0)
            seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);

        if (MEM_32bits())
            BIT_reloadDStream(&seqState->DStream);

        DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
                    (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);

        if (!isLastSeq) {
            /* don't update FSE state for last Sequence */
            ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits);    /* <=  9 bits */
            ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits);    /* <=  9 bits */
            if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);    /* <= 18 bits */
            ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits);  /* <=  8 bits */
            BIT_reloadDStream(&seqState->DStream);
        }
    }

    return seq;
}
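/* Illustration : behaviour of the repeat-offset branch above (ofBits <= 1), sketched
 * under the assumption that the previous offsets are rep = {8, 4, 2}
 * (rep[0] being the most recent) :
 *   - ofBits == 0, litLength > 0  : offset = rep[0] = 8 ; rep stays {8, 4, 2}
 *   - ofBits == 0, litLength == 0 : offset = rep[1] = 4 ; rep becomes {4, 8, 2}
 *   - ofBits == 1, index (ofBase + ll0 + bit) == 2 : offset = rep[2] = 2 ; rep becomes {2, 8, 4}
 *   - ofBits == 1, index == 3 : offset = rep[0] - 1 = 7 ; rep becomes {7, 8, 4}
 * A stored offset of 0 would be invalid ; `temp -= !temp` maps it to (size_t)-1 so the
 * following ZSTD_execSequence*() call reports corruption_detected.
 */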
#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
#if DEBUGLEVEL >= 1
static int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd)
{
    size_t const windowSize = dctx->fParams.windowSize;
    /* No dictionary used. */
    if (dctx->dictContentEndForFuzzing == NULL) return 0;
    /* Dictionary is our prefix. */
    if (prefixStart == dctx->dictContentBeginForFuzzing) return 1;
    /* Dictionary is not our ext-dict. */
    if (dctx->dictEnd != dctx->dictContentEndForFuzzing) return 0;
    /* Dictionary is not within our window size. */
    if ((size_t)(oLitEnd - prefixStart) >= windowSize) return 0;
    /* Dictionary is active. */
    return 1;
}
#endif

static void ZSTD_assertValidSequence(
        ZSTD_DCtx const* dctx,
        BYTE const* op, BYTE const* oend,
        seq_t const seq,
        BYTE const* prefixStart, BYTE const* virtualStart)
{
#if DEBUGLEVEL >= 1
    if (dctx->isFrameDecompression) {
        size_t const windowSize = dctx->fParams.windowSize;
        size_t const sequenceSize = seq.litLength + seq.matchLength;
        BYTE const* const oLitEnd = op + seq.litLength;
        DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u",
                (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
        assert(op <= oend);
        assert((size_t)(oend - op) >= sequenceSize);
        assert(sequenceSize <= ZSTD_blockSizeMax(dctx));
        if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) {
            size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing);
            /* Offset must be within the dictionary. */
            assert(seq.offset <= (size_t)(oLitEnd - virtualStart));
            assert(seq.offset <= windowSize + dictSize);
        } else {
            /* Offset must be within our window. */
            assert(seq.offset <= windowSize);
        }
    }
#else
    (void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart;
#endif
}
#endif
#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG


FORCE_INLINE_TEMPLATE size_t
DONT_VECTORIZE
ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx,
                               void* dst, size_t maxDstSize,
                         const void* seqStart, size_t seqSize, int nbSeq,
                               const ZSTD_longOffset_e isLongOffset)
{
    const BYTE* ip = (const BYTE*)seqStart;
    const BYTE* const iend = ip + seqSize;
    BYTE* const ostart = (BYTE*)dst;
    BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, maxDstSize);
    BYTE* op = ostart;
    const BYTE* litPtr = dctx->litPtr;
    const BYTE* litBufferEnd = dctx->litBufferEnd;
    const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
    const BYTE* const vBase = (const BYTE*) (dctx->virtualStart);
    const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
    DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer (%i seqs)", nbSeq);

    /* Literals are split between internal buffer & output buffer */
    if (nbSeq) {
        seqState_t seqState;
        dctx->fseEntropy = 1;
        { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
        RETURN_ERROR_IF(
            ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)),
            corruption_detected, "");
        ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
        ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
        ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
        assert(dst != NULL);

        ZSTD_STATIC_ASSERT(
                BIT_DStream_unfinished < BIT_DStream_completed &&
                BIT_DStream_endOfBuffer < BIT_DStream_completed &&
                BIT_DStream_completed < BIT_DStream_overflow);

        /* decompress without overrunning litPtr begins */
        {   seq_t sequence = {0,0,0};  /* some static analyzers believe that @sequence is not initialized (it necessarily is, since the for(;;) loop has at least one iteration) */
This 1444 * performance swing is caused by parts of the decompression loop falling 1445 * out of the DSB. The entire decompression loop should fit in the DSB; 1446 * when it can't, we get much worse performance. You can measure if you've 1447 * hit the good case or the bad case with this perf command for some 1448 * compressed file test.zst: 1449 * 1450 * perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \ 1451 * -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst 1452 * 1453 * If you see most cycles served out of the MITE you've hit the bad case. 1454 * If you see most cycles served out of the DSB you've hit the good case. 1455 * If it is pretty even then you may be in an okay case. 1456 * 1457 * This issue has been reproduced on the following CPUs: 1458 * - Kabylake: MacBook Pro (15-inch, 2019) 2.4 GHz Intel Core i9 1459 * Use Instruments->Counters to get DSB/MITE cycles. 1460 * I never got performance swings, but I was able to 1461 * go from the good case of mostly DSB to half of the 1462 * cycles served from MITE. 1463 * - Coffeelake: Intel i9-9900k 1464 * - Coffeelake: Intel i7-9700k 1465 * 1466 * I haven't been able to reproduce the instability or DSB misses on any 1467 * of the following CPUs: 1468 * - Haswell 1469 * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz 1470 * - Skylake 1471 * 1472 * Alignment is done for each of the three major decompression loops: 1473 * - ZSTD_decompressSequences_bodySplitLitBuffer - presplit section of the literal buffer 1474 * - ZSTD_decompressSequences_bodySplitLitBuffer - postsplit section of the literal buffer 1475 * - ZSTD_decompressSequences_body 1476 * Alignment choices are made to minimize large swings on bad cases and influence on performance 1477 * from changes external to this code, rather than to overoptimize on the current commit. 1478 * 1479 * If you are seeing performance instability, this script can help test. 1480 * It tests on 4 commits in zstd where I saw performance change.
1481 * 1482 * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4 1483 */ 1484 #if defined(__GNUC__) && defined(__x86_64__) 1485 __asm__(".p2align 6"); 1486 # if __GNUC__ >= 7 1487 /* good for gcc-7, gcc-9, and gcc-11 */ 1488 __asm__("nop"); 1489 __asm__(".p2align 5"); 1490 __asm__("nop"); 1491 __asm__(".p2align 4"); 1492 # if __GNUC__ == 8 || __GNUC__ == 10 1493 /* good for gcc-8 and gcc-10 */ 1494 __asm__("nop"); 1495 __asm__(".p2align 3"); 1496 # endif 1497 # endif 1498 #endif 1499 1500 /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */ 1501 for ( ; nbSeq; nbSeq--) { 1502 sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); 1503 if (litPtr + sequence.litLength > dctx->litBufferEnd) break; 1504 { size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); 1505 #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1506 assert(!ZSTD_isError(oneSeqSize)); 1507 ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); 1508 #endif 1509 if (UNLIKELY(ZSTD_isError(oneSeqSize))) 1510 return oneSeqSize; 1511 DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); 1512 op += oneSeqSize; 1513 } } 1514 DEBUGLOG(6, "reached: (litPtr + sequence.litLength > dctx->litBufferEnd)"); 1515 1516 /* If there are more sequences, they will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */ 1517 if (nbSeq > 0) { 1518 const size_t leftoverLit = dctx->litBufferEnd - litPtr; 1519 DEBUGLOG(6, "There are %i sequences left, and %zu/%zu literals left in buffer", nbSeq, leftoverLit, sequence.litLength); 1520 if (leftoverLit) { 1521 RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); 1522 ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); 1523 sequence.litLength -= leftoverLit; 1524 op += leftoverLit; 1525 } 1526 litPtr = dctx->litExtraBuffer; 1527 litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; 1528 dctx->litBufferLocation = ZSTD_not_in_dst; 1529 { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); 1530 #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1531 assert(!ZSTD_isError(oneSeqSize)); 1532 ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); 1533 #endif 1534 if (UNLIKELY(ZSTD_isError(oneSeqSize))) 1535 return oneSeqSize; 1536 DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); 1537 op += oneSeqSize; 1538 } 1539 nbSeq--; 1540 } 1541 } 1542 1543 if (nbSeq > 0) { 1544 /* there is remaining lit from extra buffer */ 1545 1546 #if defined(__GNUC__) && defined(__x86_64__) 1547 __asm__(".p2align 6"); 1548 __asm__("nop"); 1549 # if __GNUC__ != 7 1550 /* worse for gcc-7 better for gcc-8, gcc-9, and gcc-10 and clang */ 1551 __asm__(".p2align 4"); 1552 __asm__("nop"); 1553 __asm__(".p2align 3"); 1554 # elif __GNUC__ >= 11 1555 __asm__(".p2align 3"); 1556 # else 1557 __asm__(".p2align 5"); 1558 __asm__("nop"); 1559 __asm__(".p2align 3"); 1560 # endif 1561 #endif 1562 1563 for ( ; nbSeq ; nbSeq--) { 1564 seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); 1565 size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); 
1566 #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1567 assert(!ZSTD_isError(oneSeqSize)); 1568 ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); 1569 #endif 1570 if (UNLIKELY(ZSTD_isError(oneSeqSize))) 1571 return oneSeqSize; 1572 DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); 1573 op += oneSeqSize; 1574 } 1575 } 1576 1577 /* check if reached exact end */ 1578 DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq); 1579 RETURN_ERROR_IF(nbSeq, corruption_detected, ""); 1580 DEBUGLOG(5, "bitStream : start=%p, ptr=%p, bitsConsumed=%u", seqState.DStream.start, seqState.DStream.ptr, seqState.DStream.bitsConsumed); 1581 RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); 1582 /* save reps for next block */ 1583 { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); } 1584 } 1585 1586 /* last literal segment */ 1587 if (dctx->litBufferLocation == ZSTD_split) { 1588 /* split hasn't been reached yet, first get dst then copy litExtraBuffer */ 1589 size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); 1590 DEBUGLOG(6, "copy last literals from segment : %u", (U32)lastLLSize); 1591 RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); 1592 if (op != NULL) { 1593 ZSTD_memmove(op, litPtr, lastLLSize); 1594 op += lastLLSize; 1595 } 1596 litPtr = dctx->litExtraBuffer; 1597 litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; 1598 dctx->litBufferLocation = ZSTD_not_in_dst; 1599 } 1600 /* copy last literals from internal buffer */ 1601 { size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); 1602 DEBUGLOG(6, "copy last literals from internal buffer : %u", (U32)lastLLSize); 1603 RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); 1604 if (op != NULL) { 1605 ZSTD_memcpy(op, litPtr, lastLLSize); 1606 op += lastLLSize; 1607 } } 1608 1609 DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart)); 1610 return (size_t)(op - ostart); 1611 } 1612 1613 FORCE_INLINE_TEMPLATE size_t 1614 DONT_VECTORIZE 1615 ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, 1616 void* dst, size_t maxDstSize, 1617 const void* seqStart, size_t seqSize, int nbSeq, 1618 const ZSTD_longOffset_e isLongOffset) 1619 { 1620 const BYTE* ip = (const BYTE*)seqStart; 1621 const BYTE* const iend = ip + seqSize; 1622 BYTE* const ostart = (BYTE*)dst; 1623 BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? 
ZSTD_maybeNullPtrAdd(ostart, maxDstSize) : dctx->litBuffer; 1624 BYTE* op = ostart; 1625 const BYTE* litPtr = dctx->litPtr; 1626 const BYTE* const litEnd = litPtr + dctx->litSize; 1627 const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart); 1628 const BYTE* const vBase = (const BYTE*)(dctx->virtualStart); 1629 const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd); 1630 DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq); 1631 1632 /* Regen sequences */ 1633 if (nbSeq) { 1634 seqState_t seqState; 1635 dctx->fseEntropy = 1; 1636 { U32 i; for (i = 0; i < ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; } 1637 RETURN_ERROR_IF( 1638 ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend - ip)), 1639 corruption_detected, ""); 1640 ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); 1641 ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); 1642 ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); 1643 assert(dst != NULL); 1644 1645 #if defined(__GNUC__) && defined(__x86_64__) 1646 __asm__(".p2align 6"); 1647 __asm__("nop"); 1648 # if __GNUC__ >= 7 1649 __asm__(".p2align 5"); 1650 __asm__("nop"); 1651 __asm__(".p2align 3"); 1652 # else 1653 __asm__(".p2align 4"); 1654 __asm__("nop"); 1655 __asm__(".p2align 3"); 1656 # endif 1657 #endif 1658 1659 for ( ; nbSeq ; nbSeq--) { 1660 seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); 1661 size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd); 1662 #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1663 assert(!ZSTD_isError(oneSeqSize)); 1664 ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); 1665 #endif 1666 if (UNLIKELY(ZSTD_isError(oneSeqSize))) 1667 return oneSeqSize; 1668 DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); 1669 op += oneSeqSize; 1670 } 1671 1672 /* check if reached exact end */ 1673 assert(nbSeq == 0); 1674 RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); 1675 /* save reps for next block */ 1676 { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); } 1677 } 1678 1679 /* last literal segment */ 1680 { size_t const lastLLSize = (size_t)(litEnd - litPtr); 1681 DEBUGLOG(6, "copy last literals : %u", (U32)lastLLSize); 1682 RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); 1683 if (op != NULL) { 1684 ZSTD_memcpy(op, litPtr, lastLLSize); 1685 op += lastLLSize; 1686 } } 1687 1688 DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart)); 1689 return (size_t)(op - ostart); 1690 } 1691 1692 static size_t 1693 ZSTD_decompressSequences_default(ZSTD_DCtx* dctx, 1694 void* dst, size_t maxDstSize, 1695 const void* seqStart, size_t seqSize, int nbSeq, 1696 const ZSTD_longOffset_e isLongOffset) 1697 { 1698 return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); 1699 } 1700 1701 static size_t 1702 ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx, 1703 void* dst, size_t maxDstSize, 1704 const void* seqStart, size_t seqSize, int nbSeq, 1705 const ZSTD_longOffset_e isLongOffset) 1706 { 1707 return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); 1708 } 1709 #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ 1710 1711 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT 1712 1713 
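/* Build note (illustrative; the exact invocation depends on your build system) :
 * everything from here down to the matching #endif is the "long" / prefetching decoder.
 * It is compiled out entirely when the library is built with
 * ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT defined, for example :
 *
 *     make lib CPPFLAGS="-DZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT"
 *
 * in which case ZSTD_decompressBlock_internal() below always dispatches to the regular
 * decoders; defining ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG instead forces the prefetching
 * decoder and compiles out the regular variants above. */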
FORCE_INLINE_TEMPLATE 1714 1715 size_t ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence, 1716 const BYTE* const prefixStart, const BYTE* const dictEnd) 1717 { 1718 prefetchPos += sequence.litLength; 1719 { const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart; 1720 /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. 1721 * No consequence though : memory address is only used for prefetching, not for dereferencing */ 1722 const BYTE* const match = ZSTD_wrappedPtrSub(ZSTD_wrappedPtrAdd(matchBase, prefetchPos), sequence.offset); 1723 PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */ 1724 } 1725 return prefetchPos + sequence.matchLength; 1726 } 1727 1728 /* This decoding function employs prefetching 1729 * to reduce latency impact of cache misses. 1730 * It's generally employed when block contains a significant portion of long-distance matches 1731 * or when coupled with a "cold" dictionary */ 1732 FORCE_INLINE_TEMPLATE size_t 1733 ZSTD_decompressSequencesLong_body( 1734 ZSTD_DCtx* dctx, 1735 void* dst, size_t maxDstSize, 1736 const void* seqStart, size_t seqSize, int nbSeq, 1737 const ZSTD_longOffset_e isLongOffset) 1738 { 1739 const BYTE* ip = (const BYTE*)seqStart; 1740 const BYTE* const iend = ip + seqSize; 1741 BYTE* const ostart = (BYTE*)dst; 1742 BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ZSTD_maybeNullPtrAdd(ostart, maxDstSize); 1743 BYTE* op = ostart; 1744 const BYTE* litPtr = dctx->litPtr; 1745 const BYTE* litBufferEnd = dctx->litBufferEnd; 1746 const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); 1747 const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart); 1748 const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); 1749 1750 /* Regen sequences */ 1751 if (nbSeq) { 1752 #define STORED_SEQS 8 1753 #define STORED_SEQS_MASK (STORED_SEQS-1) 1754 #define ADVANCED_SEQS STORED_SEQS 1755 seq_t sequences[STORED_SEQS]; 1756 int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS); 1757 seqState_t seqState; 1758 int seqNb; 1759 size_t prefetchPos = (size_t)(op-prefixStart); /* track position relative to prefixStart */ 1760 1761 dctx->fseEntropy = 1; 1762 { int i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; } 1763 assert(dst != NULL); 1764 assert(iend >= ip); 1765 RETURN_ERROR_IF( 1766 ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)), 1767 corruption_detected, ""); 1768 ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); 1769 ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); 1770 ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); 1771 1772 /* prepare in advance */ 1773 for (seqNb=0; seqNb<seqAdvance; seqNb++) { 1774 seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, seqNb == nbSeq-1); 1775 prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); 1776 sequences[seqNb] = sequence; 1777 } 1778 1779 /* decompress without stomping litBuffer */ 1780 for (; seqNb < nbSeq; seqNb++) { 1781 seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset, seqNb == nbSeq-1); 1782 1783 if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) { 1784 /* lit buffer is reaching split point, empty out the first buffer and transition 
to litExtraBuffer */ 1785 const size_t leftoverLit = dctx->litBufferEnd - litPtr; 1786 if (leftoverLit) 1787 { 1788 RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); 1789 ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); 1790 sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength -= leftoverLit; 1791 op += leftoverLit; 1792 } 1793 litPtr = dctx->litExtraBuffer; 1794 litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; 1795 dctx->litBufferLocation = ZSTD_not_in_dst; 1796 { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); 1797 #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1798 assert(!ZSTD_isError(oneSeqSize)); 1799 ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); 1800 #endif 1801 if (ZSTD_isError(oneSeqSize)) return oneSeqSize; 1802 1803 prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); 1804 sequences[seqNb & STORED_SEQS_MASK] = sequence; 1805 op += oneSeqSize; 1806 } } 1807 else 1808 { 1809 /* lit buffer is either wholly contained in first or second split, or not split at all*/ 1810 size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ? 1811 ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) : 1812 ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); 1813 #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1814 assert(!ZSTD_isError(oneSeqSize)); 1815 ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); 1816 #endif 1817 if (ZSTD_isError(oneSeqSize)) return oneSeqSize; 1818 1819 prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); 1820 sequences[seqNb & STORED_SEQS_MASK] = sequence; 1821 op += oneSeqSize; 1822 } 1823 } 1824 RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); 1825 1826 /* finish queue */ 1827 seqNb -= seqAdvance; 1828 for ( ; seqNb<nbSeq ; seqNb++) { 1829 seq_t *sequence = &(sequences[seqNb&STORED_SEQS_MASK]); 1830 if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) { 1831 const size_t leftoverLit = dctx->litBufferEnd - litPtr; 1832 if (leftoverLit) { 1833 RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); 1834 ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); 1835 sequence->litLength -= leftoverLit; 1836 op += leftoverLit; 1837 } 1838 litPtr = dctx->litExtraBuffer; 1839 litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; 1840 dctx->litBufferLocation = ZSTD_not_in_dst; 1841 { size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); 1842 #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1843 assert(!ZSTD_isError(oneSeqSize)); 1844 ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); 1845 #endif 1846 if 
(ZSTD_isError(oneSeqSize)) return oneSeqSize; 1847 op += oneSeqSize; 1848 } 1849 } 1850 else 1851 { 1852 size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ? 1853 ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence->litLength - WILDCOPY_OVERLENGTH, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) : 1854 ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); 1855 #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1856 assert(!ZSTD_isError(oneSeqSize)); 1857 ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); 1858 #endif 1859 if (ZSTD_isError(oneSeqSize)) return oneSeqSize; 1860 op += oneSeqSize; 1861 } 1862 } 1863 1864 /* save reps for next block */ 1865 { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); } 1866 } 1867 1868 /* last literal segment */ 1869 if (dctx->litBufferLocation == ZSTD_split) { /* first deplete literal buffer in dst, then copy litExtraBuffer */ 1870 size_t const lastLLSize = litBufferEnd - litPtr; 1871 RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); 1872 if (op != NULL) { 1873 ZSTD_memmove(op, litPtr, lastLLSize); 1874 op += lastLLSize; 1875 } 1876 litPtr = dctx->litExtraBuffer; 1877 litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; 1878 } 1879 { size_t const lastLLSize = litBufferEnd - litPtr; 1880 RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); 1881 if (op != NULL) { 1882 ZSTD_memmove(op, litPtr, lastLLSize); 1883 op += lastLLSize; 1884 } 1885 } 1886 1887 return (size_t)(op - ostart); 1888 } 1889 1890 static size_t 1891 ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx, 1892 void* dst, size_t maxDstSize, 1893 const void* seqStart, size_t seqSize, int nbSeq, 1894 const ZSTD_longOffset_e isLongOffset) 1895 { 1896 return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); 1897 } 1898 #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ 1899 1900 1901 1902 #if DYNAMIC_BMI2 1903 1904 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG 1905 static BMI2_TARGET_ATTRIBUTE size_t 1906 DONT_VECTORIZE 1907 ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx, 1908 void* dst, size_t maxDstSize, 1909 const void* seqStart, size_t seqSize, int nbSeq, 1910 const ZSTD_longOffset_e isLongOffset) 1911 { 1912 return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); 1913 } 1914 static BMI2_TARGET_ATTRIBUTE size_t 1915 DONT_VECTORIZE 1916 ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx, 1917 void* dst, size_t maxDstSize, 1918 const void* seqStart, size_t seqSize, int nbSeq, 1919 const ZSTD_longOffset_e isLongOffset) 1920 { 1921 return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); 1922 } 1923 #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ 1924 1925 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT 1926 static BMI2_TARGET_ATTRIBUTE size_t 1927 ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx, 1928 void* dst, size_t maxDstSize, 1929 const void* seqStart, size_t seqSize, int nbSeq, 1930 const ZSTD_longOffset_e isLongOffset) 1931 { 1932 return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); 1933 } 1934 #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ 1935 1936 #endif /* DYNAMIC_BMI2 */ 1937 1938 typedef size_t 
(*ZSTD_decompressSequences_t)( 1939 ZSTD_DCtx* dctx, 1940 void* dst, size_t maxDstSize, 1941 const void* seqStart, size_t seqSize, int nbSeq, 1942 const ZSTD_longOffset_e isLongOffset); 1943 1944 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG 1945 static size_t 1946 ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, 1947 const void* seqStart, size_t seqSize, int nbSeq, 1948 const ZSTD_longOffset_e isLongOffset) 1949 { 1950 DEBUGLOG(5, "ZSTD_decompressSequences"); 1951 #if DYNAMIC_BMI2 1952 if (ZSTD_DCtx_get_bmi2(dctx)) { 1953 return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); 1954 } 1955 #endif 1956 return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); 1957 } 1958 static size_t 1959 ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, 1960 const void* seqStart, size_t seqSize, int nbSeq, 1961 const ZSTD_longOffset_e isLongOffset) 1962 { 1963 DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer"); 1964 #if DYNAMIC_BMI2 1965 if (ZSTD_DCtx_get_bmi2(dctx)) { 1966 return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); 1967 } 1968 #endif 1969 return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); 1970 } 1971 #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ 1972 1973 1974 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT 1975 /* ZSTD_decompressSequencesLong() : 1976 * decompression function triggered when a minimum share of offsets is considered "long", 1977 * aka out of cache. 1978 * note : the "long" definition seems overloaded here, sometimes meaning "wider than bitstream register", and sometimes meaning "farther than memory cache distance". 1979 * This function will try to mitigate main memory latency through the use of prefetching */ 1980 static size_t 1981 ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, 1982 void* dst, size_t maxDstSize, 1983 const void* seqStart, size_t seqSize, int nbSeq, 1984 const ZSTD_longOffset_e isLongOffset) 1985 { 1986 DEBUGLOG(5, "ZSTD_decompressSequencesLong"); 1987 #if DYNAMIC_BMI2 1988 if (ZSTD_DCtx_get_bmi2(dctx)) { 1989 return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); 1990 } 1991 #endif 1992 return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); 1993 } 1994 #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ 1995 1996 1997 /** 1998 * @returns The total size of the history referenceable by zstd, including 1999 * both the prefix and the extDict. At @p op any offset larger than this 2000 * is invalid. 2001 */ 2002 static size_t ZSTD_totalHistorySize(BYTE* op, BYTE const* virtualStart) 2003 { 2004 return (size_t)(op - virtualStart); 2005 } 2006 2007 typedef struct { 2008 unsigned longOffsetShare; 2009 unsigned maxNbAdditionalBits; 2010 } ZSTD_OffsetInfo; 2011 2012 /* ZSTD_getOffsetInfo() : 2013 * condition : offTable must be valid 2014 * @return : "share" of long offsets (arbitrarily defined as > (1<<23)) 2015 * compared to maximum possible of (1<<OffFSELog), 2016 * as well as the maximum number of additional bits required. 2017 */ 2018 static ZSTD_OffsetInfo 2019 ZSTD_getOffsetInfo(const ZSTD_seqSymbol* offTable, int nbSeq) 2020 { 2021 ZSTD_OffsetInfo info = {0, 0}; 2022 /* If nbSeq == 0, then the offTable is uninitialized, but we have 2023 * no sequences, so both values should be 0.
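 * (Illustrative figures, assuming OffFSELog == 8 as defined in zstd_internal.h :
 * when nbSeq != 0, longOffsetShare counts the offset codes carrying more than 22
 * extra bits, i.e. offBase values of at least (1<<23), scaled to a denominator of
 * (1 << OffFSELog) == 256; the minShare thresholds of 7 and 20 used by the caller
 * thus correspond to roughly 2.7% and 7.8% of the offset states.)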
2024 */ 2025 if (nbSeq != 0) { 2026 const void* ptr = offTable; 2027 U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog; 2028 const ZSTD_seqSymbol* table = offTable + 1; 2029 U32 const max = 1 << tableLog; 2030 U32 u; 2031 DEBUGLOG(5, "ZSTD_getOffsetInfo: (tableLog=%u)", tableLog); 2032 2033 assert(max <= (1 << OffFSELog)); /* max not too large */ 2034 for (u=0; u<max; u++) { 2035 info.maxNbAdditionalBits = MAX(info.maxNbAdditionalBits, table[u].nbAdditionalBits); 2036 if (table[u].nbAdditionalBits > 22) info.longOffsetShare += 1; 2037 } 2038 2039 assert(tableLog <= OffFSELog); 2040 info.longOffsetShare <<= (OffFSELog - tableLog); /* scale to OffFSELog */ 2041 } 2042 2043 return info; 2044 } 2045 2046 /** 2047 * @returns The maximum offset we can decode in one read of our bitstream, without 2048 * reloading more bits in the middle of the offset bits read. Any offsets larger 2049 * than this must use the long offset decoder. 2050 */ 2051 static size_t ZSTD_maxShortOffset(void) 2052 { 2053 if (MEM_64bits()) { 2054 /* We can decode any offset without reloading bits. 2055 * This might change if the max window size grows. 2056 */ 2057 ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31); 2058 return (size_t)-1; 2059 } else { 2060 /* The maximum offBase is (1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1. 2061 * This offBase would require STREAM_ACCUMULATOR_MIN extra bits. 2062 * Then we have to subtract ZSTD_REP_NUM to get the maximum possible offset. 2063 */ 2064 size_t const maxOffbase = ((size_t)1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1; 2065 size_t const maxOffset = maxOffbase - ZSTD_REP_NUM; 2066 assert(ZSTD_highbit32((U32)maxOffbase) == STREAM_ACCUMULATOR_MIN); 2067 return maxOffset; 2068 } 2069 } 2070 2071 size_t 2072 ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, 2073 void* dst, size_t dstCapacity, 2074 const void* src, size_t srcSize, const streaming_operation streaming) 2075 { /* blockType == blockCompressed */ 2076 const BYTE* ip = (const BYTE*)src; 2077 DEBUGLOG(5, "ZSTD_decompressBlock_internal (cSize : %u)", (unsigned)srcSize); 2078 2079 /* Note : the wording of the specification 2080 * allows a compressed block to be sized exactly ZSTD_blockSizeMax(dctx). 2081 * This generally does not happen, as it makes little sense, 2082 * since an uncompressed block would feature the same size and have no decompression cost. 2083 * Also, note that decoders from reference libzstd before v1.5.4 2084 * would consider this edge case an error. 2085 * As a consequence, avoid generating compressed blocks of size ZSTD_blockSizeMax(dctx) 2086 * for broader compatibility with the deployed ecosystem of zstd decoders */ 2087 RETURN_ERROR_IF(srcSize > ZSTD_blockSizeMax(dctx), srcSize_wrong, ""); 2088 2089 /* Decode literals section */ 2090 { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming); 2091 DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : cSize=%u, nbLiterals=%zu", (U32)litCSize, dctx->litSize); 2092 if (ZSTD_isError(litCSize)) return litCSize; 2093 ip += litCSize; 2094 srcSize -= litCSize; 2095 } 2096 2097 /* Build Decoding Tables */ 2098 { 2099 /* Compute the maximum block size, which must also work when !frame and fParams are unset. 2100 * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t.
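 * Worked example (illustrative figures, assuming 128 KB blocks and no dictionary) :
 * when decoding the third block of a frame into contiguous memory, dst starts
 * 256 KB past virtualStart, so totalHistorySize evaluates to 256 KB + blockSizeMax
 * = 384 KB, which is an upper bound on any valid offset within this block.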
2101 */ 2102 size_t const blockSizeMax = MIN(dstCapacity, ZSTD_blockSizeMax(dctx)); 2103 size_t const totalHistorySize = ZSTD_totalHistorySize(ZSTD_maybeNullPtrAdd((BYTE*)dst, blockSizeMax), (BYTE const*)dctx->virtualStart); 2104 /* isLongOffset must be true if there are long offsets. 2105 * Offsets are long if they are larger than ZSTD_maxShortOffset(). 2106 * We don't expect that to be the case in 64-bit mode. 2107 * 2108 * We check here to see if our history is large enough to allow long offsets. 2109 * If it isn't, then we can't possibly have (valid) long offsets. If the offset 2110 * is invalid, then it is okay to read it incorrectly. 2111 * 2112 * If isLongOffset is true, then we will later check our decoding table to see 2113 * if it is even possible to generate long offsets. 2114 */ 2115 ZSTD_longOffset_e isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (totalHistorySize > ZSTD_maxShortOffset())); 2116 /* These macros control at build-time which decompressor implementation 2117 * we use. If neither is defined, we do some inspection and dispatch at 2118 * runtime. 2119 */ 2120 #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ 2121 !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) 2122 int usePrefetchDecoder = dctx->ddictIsCold; 2123 #else 2124 /* Set to 1 to avoid computing offset info if we don't need to. 2125 * Otherwise this value is ignored. 2126 */ 2127 int usePrefetchDecoder = 1; 2128 #endif 2129 int nbSeq; 2130 size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize); 2131 if (ZSTD_isError(seqHSize)) return seqHSize; 2132 ip += seqHSize; 2133 srcSize -= seqHSize; 2134 2135 RETURN_ERROR_IF((dst == NULL || dstCapacity == 0) && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); 2136 RETURN_ERROR_IF(MEM_64bits() && sizeof(size_t) == sizeof(void*) && (size_t)(-1) - (size_t)dst < (size_t)(1 << 20), dstSize_tooSmall, 2137 "invalid dst"); 2138 2139 /* If we could potentially have long offsets, or we might want to use the prefetch decoder, 2140 * compute information about the share of long offsets, and the maximum nbAdditionalBits. 2141 * NOTE: could probably use a larger nbSeq limit 2142 */ 2143 if (isLongOffset || (!usePrefetchDecoder && (totalHistorySize > (1u << 24)) && (nbSeq > 8))) { 2144 ZSTD_OffsetInfo const info = ZSTD_getOffsetInfo(dctx->OFTptr, nbSeq); 2145 if (isLongOffset && info.maxNbAdditionalBits <= STREAM_ACCUMULATOR_MIN) { 2146 /* If isLongOffset, but the maximum number of additional bits that we see in our table is small 2147 * enough, then we know it is impossible to have too long an offset in this block, so we can 2148 * use the regular offset decoder. 2149 */ 2150 isLongOffset = ZSTD_lo_isRegularOffset; 2151 } 2152 if (!usePrefetchDecoder) { 2153 U32 const minShare = MEM_64bits() ?
7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ 2154 usePrefetchDecoder = (info.longOffsetShare >= minShare); 2155 } 2156 } 2157 2158 dctx->ddictIsCold = 0; 2159 2160 #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ 2161 !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) 2162 if (usePrefetchDecoder) { 2163 #else 2164 (void)usePrefetchDecoder; 2165 { 2166 #endif 2167 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT 2168 return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); 2169 #endif 2170 } 2171 2172 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG 2173 /* else */ 2174 if (dctx->litBufferLocation == ZSTD_split) 2175 return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); 2176 else 2177 return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); 2178 #endif 2179 } 2180 } 2181 2182 2183 ZSTD_ALLOW_POINTER_OVERFLOW_ATTR 2184 void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize) 2185 { 2186 if (dst != dctx->previousDstEnd && dstSize > 0) { /* not contiguous */ 2187 dctx->dictEnd = dctx->previousDstEnd; 2188 dctx->virtualStart = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart)); 2189 dctx->prefixStart = dst; 2190 dctx->previousDstEnd = dst; 2191 } 2192 } 2193 2194 2195 size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, 2196 void* dst, size_t dstCapacity, 2197 const void* src, size_t srcSize) 2198 { 2199 size_t dSize; 2200 dctx->isFrameDecompression = 0; 2201 ZSTD_checkContinuity(dctx, dst, dstCapacity); 2202 dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, not_streaming); 2203 FORWARD_IF_ERROR(dSize, ""); 2204 dctx->previousDstEnd = (char*)dst + dSize; 2205 return dSize; 2206 } 2207 2208 2209 /* NOTE: Must just wrap ZSTD_decompressBlock_deprecated() */ 2210 size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, 2211 void* dst, size_t dstCapacity, 2212 const void* src, size_t srcSize) 2213 { 2214 return ZSTD_decompressBlock_deprecated(dctx, dst, dstCapacity, src, srcSize); 2215 } 2216
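/*============================================================================
 * Illustrative usage sketch : decoding a single raw compressed block through the
 * experimental block-level API declared in zstd.h under ZSTD_STATIC_LINKING_ONLY.
 * The helper name decodeOneBlock() is hypothetical; error handling and the
 * allocation check are simplified for brevity.
 *
 *     #define ZSTD_STATIC_LINKING_ONLY
 *     #include <zstd.h>
 *
 *     static size_t decodeOneBlock(void* dst, size_t dstCapacity,
 *                                  const void* cBlock, size_t cBlockSize)
 *     {
 *         ZSTD_DCtx* const dctx = ZSTD_createDCtx();
 *         size_t result = ZSTD_decompressBegin(dctx);
 *         if (!ZSTD_isError(result))
 *             result = ZSTD_decompressBlock(dctx, dst, dstCapacity, cBlock, cBlockSize);
 *         ZSTD_freeDCtx(dctx);
 *         return result;
 *     }
 *
 * ZSTD_decompressBegin() resets entropy tables and history tracking; the return value
 * of ZSTD_decompressBlock() is the number of regenerated bytes, or an error code
 * testable with ZSTD_isError(). When decoding several dependent blocks, reuse the same
 * dctx, decompress the blocks in order into contiguous memory, and pass uncompressed
 * blocks to ZSTD_insertBlock() so that the history used by ZSTD_checkContinuity()
 * stays consistent.
 *==========================================================================*/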