1 /* ****************************************************************** 2 * huff0 huffman decoder, 3 * part of Finite State Entropy library 4 * Copyright (c) Meta Platforms, Inc. and affiliates. 5 * 6 * You can contact the author at : 7 * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy 8 * 9 * This source code is licensed under both the BSD-style license (found in the 10 * LICENSE file in the root directory of this source tree) and the GPLv2 (found 11 * in the COPYING file in the root directory of this source tree). 12 * You may select, at your option, one of the above-listed licenses. 13 ****************************************************************** */ 14 15 /* ************************************************************** 16 * Dependencies 17 ****************************************************************/ 18 #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memset */ 19 #include "../common/compiler.h" 20 #include "../common/bitstream.h" /* BIT_* */ 21 #include "../common/fse.h" /* to compress headers */ 22 #include "../common/huf.h" 23 #include "../common/error_private.h" 24 #include "../common/zstd_internal.h" 25 #include "../common/bits.h" /* ZSTD_highbit32, ZSTD_countTrailingZeros64 */ 26 27 /* ************************************************************** 28 * Constants 29 ****************************************************************/ 30 31 #define HUF_DECODER_FAST_TABLELOG 11 32 33 /* ************************************************************** 34 * Macros 35 ****************************************************************/ 36 37 #ifdef HUF_DISABLE_FAST_DECODE 38 # define HUF_ENABLE_FAST_DECODE 0 39 #else 40 # define HUF_ENABLE_FAST_DECODE 1 41 #endif 42 43 /* These two optional macros force the use one way or another of the two 44 * Huffman decompression implementations. You can't force in both directions 45 * at the same time. 46 */ 47 #if defined(HUF_FORCE_DECOMPRESS_X1) && \ 48 defined(HUF_FORCE_DECOMPRESS_X2) 49 #error "Cannot force the use of the X1 and X2 decoders at the same time!" 50 #endif 51 52 /* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is 53 * supported at runtime, so we can add the BMI2 target attribute. 54 * When it is disabled, we will still get BMI2 if it is enabled statically. 55 */ 56 #if DYNAMIC_BMI2 57 # define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE 58 #else 59 # define HUF_FAST_BMI2_ATTRS 60 #endif 61 62 #ifdef __cplusplus 63 # define HUF_EXTERN_C extern "C" 64 #else 65 # define HUF_EXTERN_C 66 #endif 67 #define HUF_ASM_DECL HUF_EXTERN_C 68 69 #if DYNAMIC_BMI2 70 # define HUF_NEED_BMI2_FUNCTION 1 71 #else 72 # define HUF_NEED_BMI2_FUNCTION 0 73 #endif 74 75 /* ************************************************************** 76 * Error Management 77 ****************************************************************/ 78 #define HUF_isError ERR_isError 79 80 81 /* ************************************************************** 82 * Byte alignment for workSpace management 83 ****************************************************************/ 84 #define HUF_ALIGN(x, a) HUF_ALIGN_MASK((x), (a) - 1) 85 #define HUF_ALIGN_MASK(x, mask) (((x) + (mask)) & ~(mask)) 86 87 88 /* ************************************************************** 89 * BMI2 Variant Wrappers 90 ****************************************************************/ 91 typedef size_t (*HUF_DecompressUsingDTableFn)(void *dst, size_t dstSize, 92 const void *cSrc, 93 size_t cSrcSize, 94 const HUF_DTable *DTable); 95 96 #if DYNAMIC_BMI2 97 98 #define HUF_DGEN(fn) \ 99 \ 100 static size_t fn##_default( \ 101 void* dst, size_t dstSize, \ 102 const void* cSrc, size_t cSrcSize, \ 103 const HUF_DTable* DTable) \ 104 { \ 105 return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ 106 } \ 107 \ 108 static BMI2_TARGET_ATTRIBUTE size_t fn##_bmi2( \ 109 void* dst, size_t dstSize, \ 110 const void* cSrc, size_t cSrcSize, \ 111 const HUF_DTable* DTable) \ 112 { \ 113 return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ 114 } \ 115 \ 116 static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ 117 size_t cSrcSize, HUF_DTable const* DTable, int flags) \ 118 { \ 119 if (flags & HUF_flags_bmi2) { \ 120 return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \ 121 } \ 122 return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \ 123 } 124 125 #else 126 127 #define HUF_DGEN(fn) \ 128 static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ 129 size_t cSrcSize, HUF_DTable const* DTable, int flags) \ 130 { \ 131 (void)flags; \ 132 return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ 133 } 134 135 #endif 136 137 138 /*-***************************/ 139 /* generic DTableDesc */ 140 /*-***************************/ 141 typedef struct { BYTE maxTableLog; BYTE tableType; BYTE tableLog; BYTE reserved; } DTableDesc; 142 143 static DTableDesc HUF_getDTableDesc(const HUF_DTable* table) 144 { 145 DTableDesc dtd; 146 ZSTD_memcpy(&dtd, table, sizeof(dtd)); 147 return dtd; 148 } 149 150 static size_t HUF_initFastDStream(BYTE const* ip) { 151 BYTE const lastByte = ip[7]; 152 size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; 153 size_t const value = MEM_readLEST(ip) | 1; 154 assert(bitsConsumed <= 8); 155 assert(sizeof(size_t) == 8); 156 return value << bitsConsumed; 157 } 158 159 160 /** 161 * The input/output arguments to the Huffman fast decoding loop: 162 * 163 * ip [in/out] - The input pointers, must be updated to reflect what is consumed. 164 * op [in/out] - The output pointers, must be updated to reflect what is written. 165 * bits [in/out] - The bitstream containers, must be updated to reflect the current state. 166 * dt [in] - The decoding table. 167 * ilowest [in] - The beginning of the valid range of the input. Decoders may read 168 * down to this pointer. It may be below iend[0]. 169 * oend [in] - The end of the output stream. op[3] must not cross oend. 170 * iend [in] - The end of each input stream. ip[i] may cross iend[i], 171 * as long as it is above ilowest, but that indicates corruption. 172 */ 173 typedef struct { 174 BYTE const* ip[4]; 175 BYTE* op[4]; 176 U64 bits[4]; 177 void const* dt; 178 BYTE const* ilowest; 179 BYTE* oend; 180 BYTE const* iend[4]; 181 } HUF_DecompressFastArgs; 182 183 typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*); 184 185 /** 186 * Initializes args for the fast decoding loop. 187 * @returns 1 on success 188 * 0 if the fallback implementation should be used. 189 * Or an error code on failure. 190 */ 191 static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) 192 { 193 void const* dt = DTable + 1; 194 U32 const dtLog = HUF_getDTableDesc(DTable).tableLog; 195 196 const BYTE* const istart = (const BYTE*)src; 197 198 BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); 199 200 /* The fast decoding loop assumes 64-bit little-endian. 201 * This condition is false on x32. 202 */ 203 if (!MEM_isLittleEndian() || MEM_32bits()) 204 return 0; 205 206 /* Avoid nullptr addition */ 207 if (dstSize == 0) 208 return 0; 209 assert(dst != NULL); 210 211 /* strict minimum : jump table + 1 byte per stream */ 212 if (srcSize < 10) 213 return ERROR(corruption_detected); 214 215 /* Must have at least 8 bytes per stream because we don't handle initializing smaller bit containers. 216 * If table log is not correct at this point, fallback to the old decoder. 217 * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder. 218 */ 219 if (dtLog != HUF_DECODER_FAST_TABLELOG) 220 return 0; 221 222 /* Read the jump table. */ 223 { 224 size_t const length1 = MEM_readLE16(istart); 225 size_t const length2 = MEM_readLE16(istart+2); 226 size_t const length3 = MEM_readLE16(istart+4); 227 size_t const length4 = srcSize - (length1 + length2 + length3 + 6); 228 args->iend[0] = istart + 6; /* jumpTable */ 229 args->iend[1] = args->iend[0] + length1; 230 args->iend[2] = args->iend[1] + length2; 231 args->iend[3] = args->iend[2] + length3; 232 233 /* HUF_initFastDStream() requires this, and this small of an input 234 * won't benefit from the ASM loop anyways. 235 */ 236 if (length1 < 8 || length2 < 8 || length3 < 8 || length4 < 8) 237 return 0; 238 if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */ 239 } 240 /* ip[] contains the position that is currently loaded into bits[]. */ 241 args->ip[0] = args->iend[1] - sizeof(U64); 242 args->ip[1] = args->iend[2] - sizeof(U64); 243 args->ip[2] = args->iend[3] - sizeof(U64); 244 args->ip[3] = (BYTE const*)src + srcSize - sizeof(U64); 245 246 /* op[] contains the output pointers. */ 247 args->op[0] = (BYTE*)dst; 248 args->op[1] = args->op[0] + (dstSize+3)/4; 249 args->op[2] = args->op[1] + (dstSize+3)/4; 250 args->op[3] = args->op[2] + (dstSize+3)/4; 251 252 /* No point to call the ASM loop for tiny outputs. */ 253 if (args->op[3] >= oend) 254 return 0; 255 256 /* bits[] is the bit container. 257 * It is read from the MSB down to the LSB. 258 * It is shifted left as it is read, and zeros are 259 * shifted in. After the lowest valid bit a 1 is 260 * set, so that CountTrailingZeros(bits[]) can be used 261 * to count how many bits we've consumed. 262 */ 263 args->bits[0] = HUF_initFastDStream(args->ip[0]); 264 args->bits[1] = HUF_initFastDStream(args->ip[1]); 265 args->bits[2] = HUF_initFastDStream(args->ip[2]); 266 args->bits[3] = HUF_initFastDStream(args->ip[3]); 267 268 /* The decoders must be sure to never read beyond ilowest. 269 * This is lower than iend[0], but allowing decoders to read 270 * down to ilowest can allow an extra iteration or two in the 271 * fast loop. 272 */ 273 args->ilowest = istart; 274 275 args->oend = oend; 276 args->dt = dt; 277 278 return 1; 279 } 280 281 static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd) 282 { 283 /* Validate that we haven't overwritten. */ 284 if (args->op[stream] > segmentEnd) 285 return ERROR(corruption_detected); 286 /* Validate that we haven't read beyond iend[]. 287 * Note that ip[] may be < iend[] because the MSB is 288 * the next bit to read, and we may have consumed 100% 289 * of the stream, so down to iend[i] - 8 is valid. 290 */ 291 if (args->ip[stream] < args->iend[stream] - 8) 292 return ERROR(corruption_detected); 293 294 /* Construct the BIT_DStream_t. */ 295 assert(sizeof(size_t) == 8); 296 bit->bitContainer = MEM_readLEST(args->ip[stream]); 297 bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]); 298 bit->start = (const char*)args->ilowest; 299 bit->limitPtr = bit->start + sizeof(size_t); 300 bit->ptr = (const char*)args->ip[stream]; 301 302 return 0; 303 } 304 305 /* Calls X(N) for each stream 0, 1, 2, 3. */ 306 #define HUF_4X_FOR_EACH_STREAM(X) \ 307 do { \ 308 X(0); \ 309 X(1); \ 310 X(2); \ 311 X(3); \ 312 } while (0) 313 314 /* Calls X(N, var) for each stream 0, 1, 2, 3. */ 315 #define HUF_4X_FOR_EACH_STREAM_WITH_VAR(X, var) \ 316 do { \ 317 X(0, (var)); \ 318 X(1, (var)); \ 319 X(2, (var)); \ 320 X(3, (var)); \ 321 } while (0) 322 323 324 #ifndef HUF_FORCE_DECOMPRESS_X2 325 326 /*-***************************/ 327 /* single-symbol decoding */ 328 /*-***************************/ 329 typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol decoding */ 330 331 /** 332 * Packs 4 HUF_DEltX1 structs into a U64. This is used to lay down 4 entries at 333 * a time. 334 */ 335 static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) { 336 U64 D4; 337 if (MEM_isLittleEndian()) { 338 D4 = (U64)((symbol << 8) + nbBits); 339 } else { 340 D4 = (U64)(symbol + (nbBits << 8)); 341 } 342 assert(D4 < (1U << 16)); 343 D4 *= 0x0001000100010001ULL; 344 return D4; 345 } 346 347 /** 348 * Increase the tableLog to targetTableLog and rescales the stats. 349 * If tableLog > targetTableLog this is a no-op. 350 * @returns New tableLog 351 */ 352 static U32 HUF_rescaleStats(BYTE* huffWeight, U32* rankVal, U32 nbSymbols, U32 tableLog, U32 targetTableLog) 353 { 354 if (tableLog > targetTableLog) 355 return tableLog; 356 if (tableLog < targetTableLog) { 357 U32 const scale = targetTableLog - tableLog; 358 U32 s; 359 /* Increase the weight for all non-zero probability symbols by scale. */ 360 for (s = 0; s < nbSymbols; ++s) { 361 huffWeight[s] += (BYTE)((huffWeight[s] == 0) ? 0 : scale); 362 } 363 /* Update rankVal to reflect the new weights. 364 * All weights except 0 get moved to weight + scale. 365 * Weights [1, scale] are empty. 366 */ 367 for (s = targetTableLog; s > scale; --s) { 368 rankVal[s] = rankVal[s - scale]; 369 } 370 for (s = scale; s > 0; --s) { 371 rankVal[s] = 0; 372 } 373 } 374 return targetTableLog; 375 } 376 377 typedef struct { 378 U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1]; 379 U32 rankStart[HUF_TABLELOG_ABSOLUTEMAX + 1]; 380 U32 statsWksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; 381 BYTE symbols[HUF_SYMBOLVALUE_MAX + 1]; 382 BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1]; 383 } HUF_ReadDTableX1_Workspace; 384 385 size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags) 386 { 387 U32 tableLog = 0; 388 U32 nbSymbols = 0; 389 size_t iSize; 390 void* const dtPtr = DTable + 1; 391 HUF_DEltX1* const dt = (HUF_DEltX1*)dtPtr; 392 HUF_ReadDTableX1_Workspace* wksp = (HUF_ReadDTableX1_Workspace*)workSpace; 393 394 DEBUG_STATIC_ASSERT(HUF_DECOMPRESS_WORKSPACE_SIZE >= sizeof(*wksp)); 395 if (sizeof(*wksp) > wkspSize) return ERROR(tableLog_tooLarge); 396 397 DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable)); 398 /* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */ 399 400 iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), flags); 401 if (HUF_isError(iSize)) return iSize; 402 403 404 /* Table header */ 405 { DTableDesc dtd = HUF_getDTableDesc(DTable); 406 U32 const maxTableLog = dtd.maxTableLog + 1; 407 U32 const targetTableLog = MIN(maxTableLog, HUF_DECODER_FAST_TABLELOG); 408 tableLog = HUF_rescaleStats(wksp->huffWeight, wksp->rankVal, nbSymbols, tableLog, targetTableLog); 409 if (tableLog > (U32)(dtd.maxTableLog+1)) return ERROR(tableLog_tooLarge); /* DTable too small, Huffman tree cannot fit in */ 410 dtd.tableType = 0; 411 dtd.tableLog = (BYTE)tableLog; 412 ZSTD_memcpy(DTable, &dtd, sizeof(dtd)); 413 } 414 415 /* Compute symbols and rankStart given rankVal: 416 * 417 * rankVal already contains the number of values of each weight. 418 * 419 * symbols contains the symbols ordered by weight. First are the rankVal[0] 420 * weight 0 symbols, followed by the rankVal[1] weight 1 symbols, and so on. 421 * symbols[0] is filled (but unused) to avoid a branch. 422 * 423 * rankStart contains the offset where each rank belongs in the DTable. 424 * rankStart[0] is not filled because there are no entries in the table for 425 * weight 0. 426 */ 427 { int n; 428 U32 nextRankStart = 0; 429 int const unroll = 4; 430 int const nLimit = (int)nbSymbols - unroll + 1; 431 for (n=0; n<(int)tableLog+1; n++) { 432 U32 const curr = nextRankStart; 433 nextRankStart += wksp->rankVal[n]; 434 wksp->rankStart[n] = curr; 435 } 436 for (n=0; n < nLimit; n += unroll) { 437 int u; 438 for (u=0; u < unroll; ++u) { 439 size_t const w = wksp->huffWeight[n+u]; 440 wksp->symbols[wksp->rankStart[w]++] = (BYTE)(n+u); 441 } 442 } 443 for (; n < (int)nbSymbols; ++n) { 444 size_t const w = wksp->huffWeight[n]; 445 wksp->symbols[wksp->rankStart[w]++] = (BYTE)n; 446 } 447 } 448 449 /* fill DTable 450 * We fill all entries of each weight in order. 451 * That way length is a constant for each iteration of the outer loop. 452 * We can switch based on the length to a different inner loop which is 453 * optimized for that particular case. 454 */ 455 { U32 w; 456 int symbol = wksp->rankVal[0]; 457 int rankStart = 0; 458 for (w=1; w<tableLog+1; ++w) { 459 int const symbolCount = wksp->rankVal[w]; 460 int const length = (1 << w) >> 1; 461 int uStart = rankStart; 462 BYTE const nbBits = (BYTE)(tableLog + 1 - w); 463 int s; 464 int u; 465 switch (length) { 466 case 1: 467 for (s=0; s<symbolCount; ++s) { 468 HUF_DEltX1 D; 469 D.byte = wksp->symbols[symbol + s]; 470 D.nbBits = nbBits; 471 dt[uStart] = D; 472 uStart += 1; 473 } 474 break; 475 case 2: 476 for (s=0; s<symbolCount; ++s) { 477 HUF_DEltX1 D; 478 D.byte = wksp->symbols[symbol + s]; 479 D.nbBits = nbBits; 480 dt[uStart+0] = D; 481 dt[uStart+1] = D; 482 uStart += 2; 483 } 484 break; 485 case 4: 486 for (s=0; s<symbolCount; ++s) { 487 U64 const D4 = HUF_DEltX1_set4(wksp->symbols[symbol + s], nbBits); 488 MEM_write64(dt + uStart, D4); 489 uStart += 4; 490 } 491 break; 492 case 8: 493 for (s=0; s<symbolCount; ++s) { 494 U64 const D4 = HUF_DEltX1_set4(wksp->symbols[symbol + s], nbBits); 495 MEM_write64(dt + uStart, D4); 496 MEM_write64(dt + uStart + 4, D4); 497 uStart += 8; 498 } 499 break; 500 default: 501 for (s=0; s<symbolCount; ++s) { 502 U64 const D4 = HUF_DEltX1_set4(wksp->symbols[symbol + s], nbBits); 503 for (u=0; u < length; u += 16) { 504 MEM_write64(dt + uStart + u + 0, D4); 505 MEM_write64(dt + uStart + u + 4, D4); 506 MEM_write64(dt + uStart + u + 8, D4); 507 MEM_write64(dt + uStart + u + 12, D4); 508 } 509 assert(u == length); 510 uStart += length; 511 } 512 break; 513 } 514 symbol += symbolCount; 515 rankStart += symbolCount * length; 516 } 517 } 518 return iSize; 519 } 520 521 FORCE_INLINE_TEMPLATE BYTE 522 HUF_decodeSymbolX1(BIT_DStream_t* Dstream, const HUF_DEltX1* dt, const U32 dtLog) 523 { 524 size_t const val = BIT_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */ 525 BYTE const c = dt[val].byte; 526 BIT_skipBits(Dstream, dt[val].nbBits); 527 return c; 528 } 529 530 #define HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) \ 531 do { *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog); } while (0) 532 533 #define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \ 534 do { \ 535 if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ 536 HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \ 537 } while (0) 538 539 #define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \ 540 do { \ 541 if (MEM_64bits()) \ 542 HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \ 543 } while (0) 544 545 HINT_INLINE size_t 546 HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX1* const dt, const U32 dtLog) 547 { 548 BYTE* const pStart = p; 549 550 /* up to 4 symbols at a time */ 551 if ((pEnd - p) > 3) { 552 while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) { 553 HUF_DECODE_SYMBOLX1_2(p, bitDPtr); 554 HUF_DECODE_SYMBOLX1_1(p, bitDPtr); 555 HUF_DECODE_SYMBOLX1_2(p, bitDPtr); 556 HUF_DECODE_SYMBOLX1_0(p, bitDPtr); 557 } 558 } else { 559 BIT_reloadDStream(bitDPtr); 560 } 561 562 /* [0-3] symbols remaining */ 563 if (MEM_32bits()) 564 while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd)) 565 HUF_DECODE_SYMBOLX1_0(p, bitDPtr); 566 567 /* no more data to retrieve from bitstream, no need to reload */ 568 while (p < pEnd) 569 HUF_DECODE_SYMBOLX1_0(p, bitDPtr); 570 571 return (size_t)(pEnd-pStart); 572 } 573 574 FORCE_INLINE_TEMPLATE size_t 575 HUF_decompress1X1_usingDTable_internal_body( 576 void* dst, size_t dstSize, 577 const void* cSrc, size_t cSrcSize, 578 const HUF_DTable* DTable) 579 { 580 BYTE* op = (BYTE*)dst; 581 BYTE* const oend = ZSTD_maybeNullPtrAdd(op, dstSize); 582 const void* dtPtr = DTable + 1; 583 const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr; 584 BIT_DStream_t bitD; 585 DTableDesc const dtd = HUF_getDTableDesc(DTable); 586 U32 const dtLog = dtd.tableLog; 587 588 CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) ); 589 590 HUF_decodeStreamX1(op, &bitD, oend, dt, dtLog); 591 592 if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected); 593 594 return dstSize; 595 } 596 597 /* HUF_decompress4X1_usingDTable_internal_body(): 598 * Conditions : 599 * @dstSize >= 6 600 */ 601 FORCE_INLINE_TEMPLATE size_t 602 HUF_decompress4X1_usingDTable_internal_body( 603 void* dst, size_t dstSize, 604 const void* cSrc, size_t cSrcSize, 605 const HUF_DTable* DTable) 606 { 607 /* Check */ 608 if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ 609 if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ 610 611 { const BYTE* const istart = (const BYTE*) cSrc; 612 BYTE* const ostart = (BYTE*) dst; 613 BYTE* const oend = ostart + dstSize; 614 BYTE* const olimit = oend - 3; 615 const void* const dtPtr = DTable + 1; 616 const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr; 617 618 /* Init */ 619 BIT_DStream_t bitD1; 620 BIT_DStream_t bitD2; 621 BIT_DStream_t bitD3; 622 BIT_DStream_t bitD4; 623 size_t const length1 = MEM_readLE16(istart); 624 size_t const length2 = MEM_readLE16(istart+2); 625 size_t const length3 = MEM_readLE16(istart+4); 626 size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6); 627 const BYTE* const istart1 = istart + 6; /* jumpTable */ 628 const BYTE* const istart2 = istart1 + length1; 629 const BYTE* const istart3 = istart2 + length2; 630 const BYTE* const istart4 = istart3 + length3; 631 const size_t segmentSize = (dstSize+3) / 4; 632 BYTE* const opStart2 = ostart + segmentSize; 633 BYTE* const opStart3 = opStart2 + segmentSize; 634 BYTE* const opStart4 = opStart3 + segmentSize; 635 BYTE* op1 = ostart; 636 BYTE* op2 = opStart2; 637 BYTE* op3 = opStart3; 638 BYTE* op4 = opStart4; 639 DTableDesc const dtd = HUF_getDTableDesc(DTable); 640 U32 const dtLog = dtd.tableLog; 641 U32 endSignal = 1; 642 643 if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ 644 if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ 645 assert(dstSize >= 6); /* validated above */ 646 CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); 647 CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); 648 CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); 649 CHECK_F( BIT_initDStream(&bitD4, istart4, length4) ); 650 651 /* up to 16 symbols per loop (4 symbols per stream) in 64-bit mode */ 652 if ((size_t)(oend - op4) >= sizeof(size_t)) { 653 for ( ; (endSignal) & (op4 < olimit) ; ) { 654 HUF_DECODE_SYMBOLX1_2(op1, &bitD1); 655 HUF_DECODE_SYMBOLX1_2(op2, &bitD2); 656 HUF_DECODE_SYMBOLX1_2(op3, &bitD3); 657 HUF_DECODE_SYMBOLX1_2(op4, &bitD4); 658 HUF_DECODE_SYMBOLX1_1(op1, &bitD1); 659 HUF_DECODE_SYMBOLX1_1(op2, &bitD2); 660 HUF_DECODE_SYMBOLX1_1(op3, &bitD3); 661 HUF_DECODE_SYMBOLX1_1(op4, &bitD4); 662 HUF_DECODE_SYMBOLX1_2(op1, &bitD1); 663 HUF_DECODE_SYMBOLX1_2(op2, &bitD2); 664 HUF_DECODE_SYMBOLX1_2(op3, &bitD3); 665 HUF_DECODE_SYMBOLX1_2(op4, &bitD4); 666 HUF_DECODE_SYMBOLX1_0(op1, &bitD1); 667 HUF_DECODE_SYMBOLX1_0(op2, &bitD2); 668 HUF_DECODE_SYMBOLX1_0(op3, &bitD3); 669 HUF_DECODE_SYMBOLX1_0(op4, &bitD4); 670 endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished; 671 endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished; 672 endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished; 673 endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished; 674 } 675 } 676 677 /* check corruption */ 678 /* note : should not be necessary : op# advance in lock step, and we control op4. 679 * but curiously, binary generated by gcc 7.2 & 7.3 with -mbmi2 runs faster when >=1 test is present */ 680 if (op1 > opStart2) return ERROR(corruption_detected); 681 if (op2 > opStart3) return ERROR(corruption_detected); 682 if (op3 > opStart4) return ERROR(corruption_detected); 683 /* note : op4 supposed already verified within main loop */ 684 685 /* finish bitStreams one by one */ 686 HUF_decodeStreamX1(op1, &bitD1, opStart2, dt, dtLog); 687 HUF_decodeStreamX1(op2, &bitD2, opStart3, dt, dtLog); 688 HUF_decodeStreamX1(op3, &bitD3, opStart4, dt, dtLog); 689 HUF_decodeStreamX1(op4, &bitD4, oend, dt, dtLog); 690 691 /* check */ 692 { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4); 693 if (!endCheck) return ERROR(corruption_detected); } 694 695 /* decoded size */ 696 return dstSize; 697 } 698 } 699 700 #if HUF_NEED_BMI2_FUNCTION 701 static BMI2_TARGET_ATTRIBUTE 702 size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc, 703 size_t cSrcSize, HUF_DTable const* DTable) { 704 return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); 705 } 706 #endif 707 708 static 709 size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, 710 size_t cSrcSize, HUF_DTable const* DTable) { 711 return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); 712 } 713 714 #if ZSTD_ENABLE_ASM_X86_64_BMI2 715 716 HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; 717 718 #endif 719 720 static HUF_FAST_BMI2_ATTRS 721 void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) 722 { 723 U64 bits[4]; 724 BYTE const* ip[4]; 725 BYTE* op[4]; 726 U16 const* const dtable = (U16 const*)args->dt; 727 BYTE* const oend = args->oend; 728 BYTE const* const ilowest = args->ilowest; 729 730 /* Copy the arguments to local variables */ 731 ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); 732 ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); 733 ZSTD_memcpy(&op, &args->op, sizeof(op)); 734 735 assert(MEM_isLittleEndian()); 736 assert(!MEM_32bits()); 737 738 for (;;) { 739 BYTE* olimit; 740 int stream; 741 742 /* Assert loop preconditions */ 743 #ifndef NDEBUG 744 for (stream = 0; stream < 4; ++stream) { 745 assert(op[stream] <= (stream == 3 ? oend : op[stream + 1])); 746 assert(ip[stream] >= ilowest); 747 } 748 #endif 749 /* Compute olimit */ 750 { 751 /* Each iteration produces 5 output symbols per stream */ 752 size_t const oiters = (size_t)(oend - op[3]) / 5; 753 /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes 754 * per stream. 755 */ 756 size_t const iiters = (size_t)(ip[0] - ilowest) / 7; 757 /* We can safely run iters iterations before running bounds checks */ 758 size_t const iters = MIN(oiters, iiters); 759 size_t const symbols = iters * 5; 760 761 /* We can simply check that op[3] < olimit, instead of checking all 762 * of our bounds, since we can't hit the other bounds until we've run 763 * iters iterations, which only happens when op[3] == olimit. 764 */ 765 olimit = op[3] + symbols; 766 767 /* Exit fast decoding loop once we reach the end. */ 768 if (op[3] == olimit) 769 break; 770 771 /* Exit the decoding loop if any input pointer has crossed the 772 * previous one. This indicates corruption, and a precondition 773 * to our loop is that ip[i] >= ip[0]. 774 */ 775 for (stream = 1; stream < 4; ++stream) { 776 if (ip[stream] < ip[stream - 1]) 777 goto _out; 778 } 779 } 780 781 #ifndef NDEBUG 782 for (stream = 1; stream < 4; ++stream) { 783 assert(ip[stream] >= ip[stream - 1]); 784 } 785 #endif 786 787 #define HUF_4X1_DECODE_SYMBOL(_stream, _symbol) \ 788 do { \ 789 int const index = (int)(bits[(_stream)] >> 53); \ 790 int const entry = (int)dtable[index]; \ 791 bits[(_stream)] <<= (entry & 0x3F); \ 792 op[(_stream)][(_symbol)] = (BYTE)((entry >> 8) & 0xFF); \ 793 } while (0) 794 795 #define HUF_4X1_RELOAD_STREAM(_stream) \ 796 do { \ 797 int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ 798 int const nbBits = ctz & 7; \ 799 int const nbBytes = ctz >> 3; \ 800 op[(_stream)] += 5; \ 801 ip[(_stream)] -= nbBytes; \ 802 bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ 803 bits[(_stream)] <<= nbBits; \ 804 } while (0) 805 806 /* Manually unroll the loop because compilers don't consistently 807 * unroll the inner loops, which destroys performance. 808 */ 809 do { 810 /* Decode 5 symbols in each of the 4 streams */ 811 HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 0); 812 HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 1); 813 HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 2); 814 HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 3); 815 HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4); 816 817 /* Reload each of the 4 the bitstreams */ 818 HUF_4X_FOR_EACH_STREAM(HUF_4X1_RELOAD_STREAM); 819 } while (op[3] < olimit); 820 821 #undef HUF_4X1_DECODE_SYMBOL 822 #undef HUF_4X1_RELOAD_STREAM 823 } 824 825 _out: 826 827 /* Save the final values of each of the state variables back to args. */ 828 ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); 829 ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); 830 ZSTD_memcpy(&args->op, &op, sizeof(op)); 831 } 832 833 /** 834 * @returns @p dstSize on success (>= 6) 835 * 0 if the fallback implementation should be used 836 * An error if an error occurred 837 */ 838 static HUF_FAST_BMI2_ATTRS 839 size_t 840 HUF_decompress4X1_usingDTable_internal_fast( 841 void* dst, size_t dstSize, 842 const void* cSrc, size_t cSrcSize, 843 const HUF_DTable* DTable, 844 HUF_DecompressFastLoopFn loopFn) 845 { 846 void const* dt = DTable + 1; 847 BYTE const* const ilowest = (BYTE const*)cSrc; 848 BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); 849 HUF_DecompressFastArgs args; 850 { size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); 851 FORWARD_IF_ERROR(ret, "Failed to init fast loop args"); 852 if (ret == 0) 853 return 0; 854 } 855 856 assert(args.ip[0] >= args.ilowest); 857 loopFn(&args); 858 859 /* Our loop guarantees that ip[] >= ilowest and that we haven't 860 * overwritten any op[]. 861 */ 862 assert(args.ip[0] >= ilowest); 863 assert(args.ip[0] >= ilowest); 864 assert(args.ip[1] >= ilowest); 865 assert(args.ip[2] >= ilowest); 866 assert(args.ip[3] >= ilowest); 867 assert(args.op[3] <= oend); 868 869 assert(ilowest == args.ilowest); 870 assert(ilowest + 6 == args.iend[0]); 871 (void)ilowest; 872 873 /* finish bit streams one by one. */ 874 { size_t const segmentSize = (dstSize+3) / 4; 875 BYTE* segmentEnd = (BYTE*)dst; 876 int i; 877 for (i = 0; i < 4; ++i) { 878 BIT_DStream_t bit; 879 if (segmentSize <= (size_t)(oend - segmentEnd)) 880 segmentEnd += segmentSize; 881 else 882 segmentEnd = oend; 883 FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption"); 884 /* Decompress and validate that we've produced exactly the expected length. */ 885 args.op[i] += HUF_decodeStreamX1(args.op[i], &bit, segmentEnd, (HUF_DEltX1 const*)dt, HUF_DECODER_FAST_TABLELOG); 886 if (args.op[i] != segmentEnd) return ERROR(corruption_detected); 887 } 888 } 889 890 /* decoded size */ 891 assert(dstSize != 0); 892 return dstSize; 893 } 894 895 HUF_DGEN(HUF_decompress1X1_usingDTable_internal) 896 897 static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, 898 size_t cSrcSize, HUF_DTable const* DTable, int flags) 899 { 900 HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X1_usingDTable_internal_default; 901 HUF_DecompressFastLoopFn loopFn = HUF_decompress4X1_usingDTable_internal_fast_c_loop; 902 903 #if DYNAMIC_BMI2 904 if (flags & HUF_flags_bmi2) { 905 fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2; 906 # if ZSTD_ENABLE_ASM_X86_64_BMI2 907 if (!(flags & HUF_flags_disableAsm)) { 908 loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; 909 } 910 # endif 911 } else { 912 return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); 913 } 914 #endif 915 916 #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) 917 if (!(flags & HUF_flags_disableAsm)) { 918 loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; 919 } 920 #endif 921 922 if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { 923 size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); 924 if (ret != 0) 925 return ret; 926 } 927 return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); 928 } 929 930 static size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, 931 const void* cSrc, size_t cSrcSize, 932 void* workSpace, size_t wkspSize, int flags) 933 { 934 const BYTE* ip = (const BYTE*) cSrc; 935 936 size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); 937 if (HUF_isError(hSize)) return hSize; 938 if (hSize >= cSrcSize) return ERROR(srcSize_wrong); 939 ip += hSize; cSrcSize -= hSize; 940 941 return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); 942 } 943 944 #endif /* HUF_FORCE_DECOMPRESS_X2 */ 945 946 947 #ifndef HUF_FORCE_DECOMPRESS_X1 948 949 /* *************************/ 950 /* double-symbols decoding */ 951 /* *************************/ 952 953 typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX2; /* double-symbols decoding */ 954 typedef struct { BYTE symbol; } sortedSymbol_t; 955 typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1]; 956 typedef rankValCol_t rankVal_t[HUF_TABLELOG_MAX]; 957 958 /** 959 * Constructs a HUF_DEltX2 in a U32. 960 */ 961 static U32 HUF_buildDEltX2U32(U32 symbol, U32 nbBits, U32 baseSeq, int level) 962 { 963 U32 seq; 964 DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, sequence) == 0); 965 DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, nbBits) == 2); 966 DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, length) == 3); 967 DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(U32)); 968 if (MEM_isLittleEndian()) { 969 seq = level == 1 ? symbol : (baseSeq + (symbol << 8)); 970 return seq + (nbBits << 16) + ((U32)level << 24); 971 } else { 972 seq = level == 1 ? (symbol << 8) : ((baseSeq << 8) + symbol); 973 return (seq << 16) + (nbBits << 8) + (U32)level; 974 } 975 } 976 977 /** 978 * Constructs a HUF_DEltX2. 979 */ 980 static HUF_DEltX2 HUF_buildDEltX2(U32 symbol, U32 nbBits, U32 baseSeq, int level) 981 { 982 HUF_DEltX2 DElt; 983 U32 const val = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level); 984 DEBUG_STATIC_ASSERT(sizeof(DElt) == sizeof(val)); 985 ZSTD_memcpy(&DElt, &val, sizeof(val)); 986 return DElt; 987 } 988 989 /** 990 * Constructs 2 HUF_DEltX2s and packs them into a U64. 991 */ 992 static U64 HUF_buildDEltX2U64(U32 symbol, U32 nbBits, U16 baseSeq, int level) 993 { 994 U32 DElt = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level); 995 return (U64)DElt + ((U64)DElt << 32); 996 } 997 998 /** 999 * Fills the DTable rank with all the symbols from [begin, end) that are each 1000 * nbBits long. 1001 * 1002 * @param DTableRank The start of the rank in the DTable. 1003 * @param begin The first symbol to fill (inclusive). 1004 * @param end The last symbol to fill (exclusive). 1005 * @param nbBits Each symbol is nbBits long. 1006 * @param tableLog The table log. 1007 * @param baseSeq If level == 1 { 0 } else { the first level symbol } 1008 * @param level The level in the table. Must be 1 or 2. 1009 */ 1010 static void HUF_fillDTableX2ForWeight( 1011 HUF_DEltX2* DTableRank, 1012 sortedSymbol_t const* begin, sortedSymbol_t const* end, 1013 U32 nbBits, U32 tableLog, 1014 U16 baseSeq, int const level) 1015 { 1016 U32 const length = 1U << ((tableLog - nbBits) & 0x1F /* quiet static-analyzer */); 1017 const sortedSymbol_t* ptr; 1018 assert(level >= 1 && level <= 2); 1019 switch (length) { 1020 case 1: 1021 for (ptr = begin; ptr != end; ++ptr) { 1022 HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level); 1023 *DTableRank++ = DElt; 1024 } 1025 break; 1026 case 2: 1027 for (ptr = begin; ptr != end; ++ptr) { 1028 HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level); 1029 DTableRank[0] = DElt; 1030 DTableRank[1] = DElt; 1031 DTableRank += 2; 1032 } 1033 break; 1034 case 4: 1035 for (ptr = begin; ptr != end; ++ptr) { 1036 U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level); 1037 ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2)); 1038 ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2)); 1039 DTableRank += 4; 1040 } 1041 break; 1042 case 8: 1043 for (ptr = begin; ptr != end; ++ptr) { 1044 U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level); 1045 ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2)); 1046 ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2)); 1047 ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2)); 1048 ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2)); 1049 DTableRank += 8; 1050 } 1051 break; 1052 default: 1053 for (ptr = begin; ptr != end; ++ptr) { 1054 U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level); 1055 HUF_DEltX2* const DTableRankEnd = DTableRank + length; 1056 for (; DTableRank != DTableRankEnd; DTableRank += 8) { 1057 ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2)); 1058 ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2)); 1059 ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2)); 1060 ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2)); 1061 } 1062 } 1063 break; 1064 } 1065 } 1066 1067 /* HUF_fillDTableX2Level2() : 1068 * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */ 1069 static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 consumedBits, 1070 const U32* rankVal, const int minWeight, const int maxWeight1, 1071 const sortedSymbol_t* sortedSymbols, U32 const* rankStart, 1072 U32 nbBitsBaseline, U16 baseSeq) 1073 { 1074 /* Fill skipped values (all positions up to rankVal[minWeight]). 1075 * These are positions only get a single symbol because the combined weight 1076 * is too large. 1077 */ 1078 if (minWeight>1) { 1079 U32 const length = 1U << ((targetLog - consumedBits) & 0x1F /* quiet static-analyzer */); 1080 U64 const DEltX2 = HUF_buildDEltX2U64(baseSeq, consumedBits, /* baseSeq */ 0, /* level */ 1); 1081 int const skipSize = rankVal[minWeight]; 1082 assert(length > 1); 1083 assert((U32)skipSize < length); 1084 switch (length) { 1085 case 2: 1086 assert(skipSize == 1); 1087 ZSTD_memcpy(DTable, &DEltX2, sizeof(DEltX2)); 1088 break; 1089 case 4: 1090 assert(skipSize <= 4); 1091 ZSTD_memcpy(DTable + 0, &DEltX2, sizeof(DEltX2)); 1092 ZSTD_memcpy(DTable + 2, &DEltX2, sizeof(DEltX2)); 1093 break; 1094 default: 1095 { 1096 int i; 1097 for (i = 0; i < skipSize; i += 8) { 1098 ZSTD_memcpy(DTable + i + 0, &DEltX2, sizeof(DEltX2)); 1099 ZSTD_memcpy(DTable + i + 2, &DEltX2, sizeof(DEltX2)); 1100 ZSTD_memcpy(DTable + i + 4, &DEltX2, sizeof(DEltX2)); 1101 ZSTD_memcpy(DTable + i + 6, &DEltX2, sizeof(DEltX2)); 1102 } 1103 } 1104 } 1105 } 1106 1107 /* Fill each of the second level symbols by weight. */ 1108 { 1109 int w; 1110 for (w = minWeight; w < maxWeight1; ++w) { 1111 int const begin = rankStart[w]; 1112 int const end = rankStart[w+1]; 1113 U32 const nbBits = nbBitsBaseline - w; 1114 U32 const totalBits = nbBits + consumedBits; 1115 HUF_fillDTableX2ForWeight( 1116 DTable + rankVal[w], 1117 sortedSymbols + begin, sortedSymbols + end, 1118 totalBits, targetLog, 1119 baseSeq, /* level */ 2); 1120 } 1121 } 1122 } 1123 1124 static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, 1125 const sortedSymbol_t* sortedList, 1126 const U32* rankStart, rankValCol_t* rankValOrigin, const U32 maxWeight, 1127 const U32 nbBitsBaseline) 1128 { 1129 U32* const rankVal = rankValOrigin[0]; 1130 const int scaleLog = nbBitsBaseline - targetLog; /* note : targetLog >= srcLog, hence scaleLog <= 1 */ 1131 const U32 minBits = nbBitsBaseline - maxWeight; 1132 int w; 1133 int const wEnd = (int)maxWeight + 1; 1134 1135 /* Fill DTable in order of weight. */ 1136 for (w = 1; w < wEnd; ++w) { 1137 int const begin = (int)rankStart[w]; 1138 int const end = (int)rankStart[w+1]; 1139 U32 const nbBits = nbBitsBaseline - w; 1140 1141 if (targetLog-nbBits >= minBits) { 1142 /* Enough room for a second symbol. */ 1143 int start = rankVal[w]; 1144 U32 const length = 1U << ((targetLog - nbBits) & 0x1F /* quiet static-analyzer */); 1145 int minWeight = nbBits + scaleLog; 1146 int s; 1147 if (minWeight < 1) minWeight = 1; 1148 /* Fill the DTable for every symbol of weight w. 1149 * These symbols get at least 1 second symbol. 1150 */ 1151 for (s = begin; s != end; ++s) { 1152 HUF_fillDTableX2Level2( 1153 DTable + start, targetLog, nbBits, 1154 rankValOrigin[nbBits], minWeight, wEnd, 1155 sortedList, rankStart, 1156 nbBitsBaseline, sortedList[s].symbol); 1157 start += length; 1158 } 1159 } else { 1160 /* Only a single symbol. */ 1161 HUF_fillDTableX2ForWeight( 1162 DTable + rankVal[w], 1163 sortedList + begin, sortedList + end, 1164 nbBits, targetLog, 1165 /* baseSeq */ 0, /* level */ 1); 1166 } 1167 } 1168 } 1169 1170 typedef struct { 1171 rankValCol_t rankVal[HUF_TABLELOG_MAX]; 1172 U32 rankStats[HUF_TABLELOG_MAX + 1]; 1173 U32 rankStart0[HUF_TABLELOG_MAX + 3]; 1174 sortedSymbol_t sortedSymbol[HUF_SYMBOLVALUE_MAX + 1]; 1175 BYTE weightList[HUF_SYMBOLVALUE_MAX + 1]; 1176 U32 calleeWksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; 1177 } HUF_ReadDTableX2_Workspace; 1178 1179 size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, 1180 const void* src, size_t srcSize, 1181 void* workSpace, size_t wkspSize, int flags) 1182 { 1183 U32 tableLog, maxW, nbSymbols; 1184 DTableDesc dtd = HUF_getDTableDesc(DTable); 1185 U32 maxTableLog = dtd.maxTableLog; 1186 size_t iSize; 1187 void* dtPtr = DTable+1; /* force compiler to avoid strict-aliasing */ 1188 HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr; 1189 U32 *rankStart; 1190 1191 HUF_ReadDTableX2_Workspace* const wksp = (HUF_ReadDTableX2_Workspace*)workSpace; 1192 1193 if (sizeof(*wksp) > wkspSize) return ERROR(GENERIC); 1194 1195 rankStart = wksp->rankStart0 + 1; 1196 ZSTD_memset(wksp->rankStats, 0, sizeof(wksp->rankStats)); 1197 ZSTD_memset(wksp->rankStart0, 0, sizeof(wksp->rankStart0)); 1198 1199 DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(HUF_DTable)); /* if compiler fails here, assertion is wrong */ 1200 if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); 1201 /* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... */ 1202 1203 iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), flags); 1204 if (HUF_isError(iSize)) return iSize; 1205 1206 /* check result */ 1207 if (tableLog > maxTableLog) return ERROR(tableLog_tooLarge); /* DTable can't fit code depth */ 1208 if (tableLog <= HUF_DECODER_FAST_TABLELOG && maxTableLog > HUF_DECODER_FAST_TABLELOG) maxTableLog = HUF_DECODER_FAST_TABLELOG; 1209 1210 /* find maxWeight */ 1211 for (maxW = tableLog; wksp->rankStats[maxW]==0; maxW--) {} /* necessarily finds a solution before 0 */ 1212 1213 /* Get start index of each weight */ 1214 { U32 w, nextRankStart = 0; 1215 for (w=1; w<maxW+1; w++) { 1216 U32 curr = nextRankStart; 1217 nextRankStart += wksp->rankStats[w]; 1218 rankStart[w] = curr; 1219 } 1220 rankStart[0] = nextRankStart; /* put all 0w symbols at the end of sorted list*/ 1221 rankStart[maxW+1] = nextRankStart; 1222 } 1223 1224 /* sort symbols by weight */ 1225 { U32 s; 1226 for (s=0; s<nbSymbols; s++) { 1227 U32 const w = wksp->weightList[s]; 1228 U32 const r = rankStart[w]++; 1229 wksp->sortedSymbol[r].symbol = (BYTE)s; 1230 } 1231 rankStart[0] = 0; /* forget 0w symbols; this is beginning of weight(1) */ 1232 } 1233 1234 /* Build rankVal */ 1235 { U32* const rankVal0 = wksp->rankVal[0]; 1236 { int const rescale = (maxTableLog-tableLog) - 1; /* tableLog <= maxTableLog */ 1237 U32 nextRankVal = 0; 1238 U32 w; 1239 for (w=1; w<maxW+1; w++) { 1240 U32 curr = nextRankVal; 1241 nextRankVal += wksp->rankStats[w] << (w+rescale); 1242 rankVal0[w] = curr; 1243 } } 1244 { U32 const minBits = tableLog+1 - maxW; 1245 U32 consumed; 1246 for (consumed = minBits; consumed < maxTableLog - minBits + 1; consumed++) { 1247 U32* const rankValPtr = wksp->rankVal[consumed]; 1248 U32 w; 1249 for (w = 1; w < maxW+1; w++) { 1250 rankValPtr[w] = rankVal0[w] >> consumed; 1251 } } } } 1252 1253 HUF_fillDTableX2(dt, maxTableLog, 1254 wksp->sortedSymbol, 1255 wksp->rankStart0, wksp->rankVal, maxW, 1256 tableLog+1); 1257 1258 dtd.tableLog = (BYTE)maxTableLog; 1259 dtd.tableType = 1; 1260 ZSTD_memcpy(DTable, &dtd, sizeof(dtd)); 1261 return iSize; 1262 } 1263 1264 1265 FORCE_INLINE_TEMPLATE U32 1266 HUF_decodeSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog) 1267 { 1268 size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */ 1269 ZSTD_memcpy(op, &dt[val].sequence, 2); 1270 BIT_skipBits(DStream, dt[val].nbBits); 1271 return dt[val].length; 1272 } 1273 1274 FORCE_INLINE_TEMPLATE U32 1275 HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog) 1276 { 1277 size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */ 1278 ZSTD_memcpy(op, &dt[val].sequence, 1); 1279 if (dt[val].length==1) { 1280 BIT_skipBits(DStream, dt[val].nbBits); 1281 } else { 1282 if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) { 1283 BIT_skipBits(DStream, dt[val].nbBits); 1284 if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8)) 1285 /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */ 1286 DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8); 1287 } 1288 } 1289 return 1; 1290 } 1291 1292 #define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \ 1293 do { ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); } while (0) 1294 1295 #define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ 1296 do { \ 1297 if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ 1298 ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \ 1299 } while (0) 1300 1301 #define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \ 1302 do { \ 1303 if (MEM_64bits()) \ 1304 ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \ 1305 } while (0) 1306 1307 HINT_INLINE size_t 1308 HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, 1309 const HUF_DEltX2* const dt, const U32 dtLog) 1310 { 1311 BYTE* const pStart = p; 1312 1313 /* up to 8 symbols at a time */ 1314 if ((size_t)(pEnd - p) >= sizeof(bitDPtr->bitContainer)) { 1315 if (dtLog <= 11 && MEM_64bits()) { 1316 /* up to 10 symbols at a time */ 1317 while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-9)) { 1318 HUF_DECODE_SYMBOLX2_0(p, bitDPtr); 1319 HUF_DECODE_SYMBOLX2_0(p, bitDPtr); 1320 HUF_DECODE_SYMBOLX2_0(p, bitDPtr); 1321 HUF_DECODE_SYMBOLX2_0(p, bitDPtr); 1322 HUF_DECODE_SYMBOLX2_0(p, bitDPtr); 1323 } 1324 } else { 1325 /* up to 8 symbols at a time */ 1326 while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) { 1327 HUF_DECODE_SYMBOLX2_2(p, bitDPtr); 1328 HUF_DECODE_SYMBOLX2_1(p, bitDPtr); 1329 HUF_DECODE_SYMBOLX2_2(p, bitDPtr); 1330 HUF_DECODE_SYMBOLX2_0(p, bitDPtr); 1331 } 1332 } 1333 } else { 1334 BIT_reloadDStream(bitDPtr); 1335 } 1336 1337 /* closer to end : up to 2 symbols at a time */ 1338 if ((size_t)(pEnd - p) >= 2) { 1339 while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2)) 1340 HUF_DECODE_SYMBOLX2_0(p, bitDPtr); 1341 1342 while (p <= pEnd-2) 1343 HUF_DECODE_SYMBOLX2_0(p, bitDPtr); /* no need to reload : reached the end of DStream */ 1344 } 1345 1346 if (p < pEnd) 1347 p += HUF_decodeLastSymbolX2(p, bitDPtr, dt, dtLog); 1348 1349 return p-pStart; 1350 } 1351 1352 FORCE_INLINE_TEMPLATE size_t 1353 HUF_decompress1X2_usingDTable_internal_body( 1354 void* dst, size_t dstSize, 1355 const void* cSrc, size_t cSrcSize, 1356 const HUF_DTable* DTable) 1357 { 1358 BIT_DStream_t bitD; 1359 1360 /* Init */ 1361 CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) ); 1362 1363 /* decode */ 1364 { BYTE* const ostart = (BYTE*) dst; 1365 BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, dstSize); 1366 const void* const dtPtr = DTable+1; /* force compiler to not use strict-aliasing */ 1367 const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr; 1368 DTableDesc const dtd = HUF_getDTableDesc(DTable); 1369 HUF_decodeStreamX2(ostart, &bitD, oend, dt, dtd.tableLog); 1370 } 1371 1372 /* check */ 1373 if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected); 1374 1375 /* decoded size */ 1376 return dstSize; 1377 } 1378 1379 /* HUF_decompress4X2_usingDTable_internal_body(): 1380 * Conditions: 1381 * @dstSize >= 6 1382 */ 1383 FORCE_INLINE_TEMPLATE size_t 1384 HUF_decompress4X2_usingDTable_internal_body( 1385 void* dst, size_t dstSize, 1386 const void* cSrc, size_t cSrcSize, 1387 const HUF_DTable* DTable) 1388 { 1389 if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ 1390 if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ 1391 1392 { const BYTE* const istart = (const BYTE*) cSrc; 1393 BYTE* const ostart = (BYTE*) dst; 1394 BYTE* const oend = ostart + dstSize; 1395 BYTE* const olimit = oend - (sizeof(size_t)-1); 1396 const void* const dtPtr = DTable+1; 1397 const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr; 1398 1399 /* Init */ 1400 BIT_DStream_t bitD1; 1401 BIT_DStream_t bitD2; 1402 BIT_DStream_t bitD3; 1403 BIT_DStream_t bitD4; 1404 size_t const length1 = MEM_readLE16(istart); 1405 size_t const length2 = MEM_readLE16(istart+2); 1406 size_t const length3 = MEM_readLE16(istart+4); 1407 size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6); 1408 const BYTE* const istart1 = istart + 6; /* jumpTable */ 1409 const BYTE* const istart2 = istart1 + length1; 1410 const BYTE* const istart3 = istart2 + length2; 1411 const BYTE* const istart4 = istart3 + length3; 1412 size_t const segmentSize = (dstSize+3) / 4; 1413 BYTE* const opStart2 = ostart + segmentSize; 1414 BYTE* const opStart3 = opStart2 + segmentSize; 1415 BYTE* const opStart4 = opStart3 + segmentSize; 1416 BYTE* op1 = ostart; 1417 BYTE* op2 = opStart2; 1418 BYTE* op3 = opStart3; 1419 BYTE* op4 = opStart4; 1420 U32 endSignal = 1; 1421 DTableDesc const dtd = HUF_getDTableDesc(DTable); 1422 U32 const dtLog = dtd.tableLog; 1423 1424 if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ 1425 if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ 1426 assert(dstSize >= 6 /* validated above */); 1427 CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); 1428 CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); 1429 CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); 1430 CHECK_F( BIT_initDStream(&bitD4, istart4, length4) ); 1431 1432 /* 16-32 symbols per loop (4-8 symbols per stream) */ 1433 if ((size_t)(oend - op4) >= sizeof(size_t)) { 1434 for ( ; (endSignal) & (op4 < olimit); ) { 1435 #if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) 1436 HUF_DECODE_SYMBOLX2_2(op1, &bitD1); 1437 HUF_DECODE_SYMBOLX2_1(op1, &bitD1); 1438 HUF_DECODE_SYMBOLX2_2(op1, &bitD1); 1439 HUF_DECODE_SYMBOLX2_0(op1, &bitD1); 1440 HUF_DECODE_SYMBOLX2_2(op2, &bitD2); 1441 HUF_DECODE_SYMBOLX2_1(op2, &bitD2); 1442 HUF_DECODE_SYMBOLX2_2(op2, &bitD2); 1443 HUF_DECODE_SYMBOLX2_0(op2, &bitD2); 1444 endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished; 1445 endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished; 1446 HUF_DECODE_SYMBOLX2_2(op3, &bitD3); 1447 HUF_DECODE_SYMBOLX2_1(op3, &bitD3); 1448 HUF_DECODE_SYMBOLX2_2(op3, &bitD3); 1449 HUF_DECODE_SYMBOLX2_0(op3, &bitD3); 1450 HUF_DECODE_SYMBOLX2_2(op4, &bitD4); 1451 HUF_DECODE_SYMBOLX2_1(op4, &bitD4); 1452 HUF_DECODE_SYMBOLX2_2(op4, &bitD4); 1453 HUF_DECODE_SYMBOLX2_0(op4, &bitD4); 1454 endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished; 1455 endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished; 1456 #else 1457 HUF_DECODE_SYMBOLX2_2(op1, &bitD1); 1458 HUF_DECODE_SYMBOLX2_2(op2, &bitD2); 1459 HUF_DECODE_SYMBOLX2_2(op3, &bitD3); 1460 HUF_DECODE_SYMBOLX2_2(op4, &bitD4); 1461 HUF_DECODE_SYMBOLX2_1(op1, &bitD1); 1462 HUF_DECODE_SYMBOLX2_1(op2, &bitD2); 1463 HUF_DECODE_SYMBOLX2_1(op3, &bitD3); 1464 HUF_DECODE_SYMBOLX2_1(op4, &bitD4); 1465 HUF_DECODE_SYMBOLX2_2(op1, &bitD1); 1466 HUF_DECODE_SYMBOLX2_2(op2, &bitD2); 1467 HUF_DECODE_SYMBOLX2_2(op3, &bitD3); 1468 HUF_DECODE_SYMBOLX2_2(op4, &bitD4); 1469 HUF_DECODE_SYMBOLX2_0(op1, &bitD1); 1470 HUF_DECODE_SYMBOLX2_0(op2, &bitD2); 1471 HUF_DECODE_SYMBOLX2_0(op3, &bitD3); 1472 HUF_DECODE_SYMBOLX2_0(op4, &bitD4); 1473 endSignal = (U32)LIKELY((U32) 1474 (BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished) 1475 & (BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished) 1476 & (BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished) 1477 & (BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished)); 1478 #endif 1479 } 1480 } 1481 1482 /* check corruption */ 1483 if (op1 > opStart2) return ERROR(corruption_detected); 1484 if (op2 > opStart3) return ERROR(corruption_detected); 1485 if (op3 > opStart4) return ERROR(corruption_detected); 1486 /* note : op4 already verified within main loop */ 1487 1488 /* finish bitStreams one by one */ 1489 HUF_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog); 1490 HUF_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog); 1491 HUF_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog); 1492 HUF_decodeStreamX2(op4, &bitD4, oend, dt, dtLog); 1493 1494 /* check */ 1495 { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4); 1496 if (!endCheck) return ERROR(corruption_detected); } 1497 1498 /* decoded size */ 1499 return dstSize; 1500 } 1501 } 1502 1503 #if HUF_NEED_BMI2_FUNCTION 1504 static BMI2_TARGET_ATTRIBUTE 1505 size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc, 1506 size_t cSrcSize, HUF_DTable const* DTable) { 1507 return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); 1508 } 1509 #endif 1510 1511 static 1512 size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, 1513 size_t cSrcSize, HUF_DTable const* DTable) { 1514 return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); 1515 } 1516 1517 #if ZSTD_ENABLE_ASM_X86_64_BMI2 1518 1519 HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; 1520 1521 #endif 1522 1523 static HUF_FAST_BMI2_ATTRS 1524 void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) 1525 { 1526 U64 bits[4]; 1527 BYTE const* ip[4]; 1528 BYTE* op[4]; 1529 BYTE* oend[4]; 1530 HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt; 1531 BYTE const* const ilowest = args->ilowest; 1532 1533 /* Copy the arguments to local registers. */ 1534 ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); 1535 ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); 1536 ZSTD_memcpy(&op, &args->op, sizeof(op)); 1537 1538 oend[0] = op[1]; 1539 oend[1] = op[2]; 1540 oend[2] = op[3]; 1541 oend[3] = args->oend; 1542 1543 assert(MEM_isLittleEndian()); 1544 assert(!MEM_32bits()); 1545 1546 for (;;) { 1547 BYTE* olimit; 1548 int stream; 1549 1550 /* Assert loop preconditions */ 1551 #ifndef NDEBUG 1552 for (stream = 0; stream < 4; ++stream) { 1553 assert(op[stream] <= oend[stream]); 1554 assert(ip[stream] >= ilowest); 1555 } 1556 #endif 1557 /* Compute olimit */ 1558 { 1559 /* Each loop does 5 table lookups for each of the 4 streams. 1560 * Each table lookup consumes up to 11 bits of input, and produces 1561 * up to 2 bytes of output. 1562 */ 1563 /* We can consume up to 7 bytes of input per iteration per stream. 1564 * We also know that each input pointer is >= ip[0]. So we can run 1565 * iters loops before running out of input. 1566 */ 1567 size_t iters = (size_t)(ip[0] - ilowest) / 7; 1568 /* Each iteration can produce up to 10 bytes of output per stream. 1569 * Each output stream my advance at different rates. So take the 1570 * minimum number of safe iterations among all the output streams. 1571 */ 1572 for (stream = 0; stream < 4; ++stream) { 1573 size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10; 1574 iters = MIN(iters, oiters); 1575 } 1576 1577 /* Each iteration produces at least 5 output symbols. So until 1578 * op[3] crosses olimit, we know we haven't executed iters 1579 * iterations yet. This saves us maintaining an iters counter, 1580 * at the expense of computing the remaining # of iterations 1581 * more frequently. 1582 */ 1583 olimit = op[3] + (iters * 5); 1584 1585 /* Exit the fast decoding loop once we reach the end. */ 1586 if (op[3] == olimit) 1587 break; 1588 1589 /* Exit the decoding loop if any input pointer has crossed the 1590 * previous one. This indicates corruption, and a precondition 1591 * to our loop is that ip[i] >= ip[0]. 1592 */ 1593 for (stream = 1; stream < 4; ++stream) { 1594 if (ip[stream] < ip[stream - 1]) 1595 goto _out; 1596 } 1597 } 1598 1599 #ifndef NDEBUG 1600 for (stream = 1; stream < 4; ++stream) { 1601 assert(ip[stream] >= ip[stream - 1]); 1602 } 1603 #endif 1604 1605 #define HUF_4X2_DECODE_SYMBOL(_stream, _decode3) \ 1606 do { \ 1607 if ((_decode3) || (_stream) != 3) { \ 1608 int const index = (int)(bits[(_stream)] >> 53); \ 1609 HUF_DEltX2 const entry = dtable[index]; \ 1610 MEM_write16(op[(_stream)], entry.sequence); \ 1611 bits[(_stream)] <<= (entry.nbBits) & 0x3F; \ 1612 op[(_stream)] += (entry.length); \ 1613 } \ 1614 } while (0) 1615 1616 #define HUF_4X2_RELOAD_STREAM(_stream) \ 1617 do { \ 1618 HUF_4X2_DECODE_SYMBOL(3, 1); \ 1619 { \ 1620 int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ 1621 int const nbBits = ctz & 7; \ 1622 int const nbBytes = ctz >> 3; \ 1623 ip[(_stream)] -= nbBytes; \ 1624 bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ 1625 bits[(_stream)] <<= nbBits; \ 1626 } \ 1627 } while (0) 1628 1629 /* Manually unroll the loop because compilers don't consistently 1630 * unroll the inner loops, which destroys performance. 1631 */ 1632 do { 1633 /* Decode 5 symbols from each of the first 3 streams. 1634 * The final stream will be decoded during the reload phase 1635 * to reduce register pressure. 1636 */ 1637 HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); 1638 HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); 1639 HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); 1640 HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); 1641 HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); 1642 1643 /* Decode one symbol from the final stream */ 1644 HUF_4X2_DECODE_SYMBOL(3, 1); 1645 1646 /* Decode 4 symbols from the final stream & reload bitstreams. 1647 * The final stream is reloaded last, meaning that all 5 symbols 1648 * are decoded from the final stream before it is reloaded. 1649 */ 1650 HUF_4X_FOR_EACH_STREAM(HUF_4X2_RELOAD_STREAM); 1651 } while (op[3] < olimit); 1652 } 1653 1654 #undef HUF_4X2_DECODE_SYMBOL 1655 #undef HUF_4X2_RELOAD_STREAM 1656 1657 _out: 1658 1659 /* Save the final values of each of the state variables back to args. */ 1660 ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); 1661 ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); 1662 ZSTD_memcpy(&args->op, &op, sizeof(op)); 1663 } 1664 1665 1666 static HUF_FAST_BMI2_ATTRS size_t 1667 HUF_decompress4X2_usingDTable_internal_fast( 1668 void* dst, size_t dstSize, 1669 const void* cSrc, size_t cSrcSize, 1670 const HUF_DTable* DTable, 1671 HUF_DecompressFastLoopFn loopFn) { 1672 void const* dt = DTable + 1; 1673 const BYTE* const ilowest = (const BYTE*)cSrc; 1674 BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); 1675 HUF_DecompressFastArgs args; 1676 { 1677 size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); 1678 FORWARD_IF_ERROR(ret, "Failed to init asm args"); 1679 if (ret == 0) 1680 return 0; 1681 } 1682 1683 assert(args.ip[0] >= args.ilowest); 1684 loopFn(&args); 1685 1686 /* note : op4 already verified within main loop */ 1687 assert(args.ip[0] >= ilowest); 1688 assert(args.ip[1] >= ilowest); 1689 assert(args.ip[2] >= ilowest); 1690 assert(args.ip[3] >= ilowest); 1691 assert(args.op[3] <= oend); 1692 1693 assert(ilowest == args.ilowest); 1694 assert(ilowest + 6 == args.iend[0]); 1695 (void)ilowest; 1696 1697 /* finish bitStreams one by one */ 1698 { 1699 size_t const segmentSize = (dstSize+3) / 4; 1700 BYTE* segmentEnd = (BYTE*)dst; 1701 int i; 1702 for (i = 0; i < 4; ++i) { 1703 BIT_DStream_t bit; 1704 if (segmentSize <= (size_t)(oend - segmentEnd)) 1705 segmentEnd += segmentSize; 1706 else 1707 segmentEnd = oend; 1708 FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption"); 1709 args.op[i] += HUF_decodeStreamX2(args.op[i], &bit, segmentEnd, (HUF_DEltX2 const*)dt, HUF_DECODER_FAST_TABLELOG); 1710 if (args.op[i] != segmentEnd) 1711 return ERROR(corruption_detected); 1712 } 1713 } 1714 1715 /* decoded size */ 1716 return dstSize; 1717 } 1718 1719 static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, 1720 size_t cSrcSize, HUF_DTable const* DTable, int flags) 1721 { 1722 HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X2_usingDTable_internal_default; 1723 HUF_DecompressFastLoopFn loopFn = HUF_decompress4X2_usingDTable_internal_fast_c_loop; 1724 1725 #if DYNAMIC_BMI2 1726 if (flags & HUF_flags_bmi2) { 1727 fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2; 1728 # if ZSTD_ENABLE_ASM_X86_64_BMI2 1729 if (!(flags & HUF_flags_disableAsm)) { 1730 loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; 1731 } 1732 # endif 1733 } else { 1734 return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); 1735 } 1736 #endif 1737 1738 #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) 1739 if (!(flags & HUF_flags_disableAsm)) { 1740 loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; 1741 } 1742 #endif 1743 1744 if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { 1745 size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); 1746 if (ret != 0) 1747 return ret; 1748 } 1749 return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); 1750 } 1751 1752 HUF_DGEN(HUF_decompress1X2_usingDTable_internal) 1753 1754 size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, 1755 const void* cSrc, size_t cSrcSize, 1756 void* workSpace, size_t wkspSize, int flags) 1757 { 1758 const BYTE* ip = (const BYTE*) cSrc; 1759 1760 size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, 1761 workSpace, wkspSize, flags); 1762 if (HUF_isError(hSize)) return hSize; 1763 if (hSize >= cSrcSize) return ERROR(srcSize_wrong); 1764 ip += hSize; cSrcSize -= hSize; 1765 1766 return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, flags); 1767 } 1768 1769 static size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, 1770 const void* cSrc, size_t cSrcSize, 1771 void* workSpace, size_t wkspSize, int flags) 1772 { 1773 const BYTE* ip = (const BYTE*) cSrc; 1774 1775 size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, 1776 workSpace, wkspSize, flags); 1777 if (HUF_isError(hSize)) return hSize; 1778 if (hSize >= cSrcSize) return ERROR(srcSize_wrong); 1779 ip += hSize; cSrcSize -= hSize; 1780 1781 return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); 1782 } 1783 1784 #endif /* HUF_FORCE_DECOMPRESS_X1 */ 1785 1786 1787 /* ***********************************/ 1788 /* Universal decompression selectors */ 1789 /* ***********************************/ 1790 1791 1792 #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2) 1793 typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t; 1794 static const algo_time_t algoTime[16 /* Quantization */][2 /* single, double */] = 1795 { 1796 /* single, double, quad */ 1797 {{0,0}, {1,1}}, /* Q==0 : impossible */ 1798 {{0,0}, {1,1}}, /* Q==1 : impossible */ 1799 {{ 150,216}, { 381,119}}, /* Q == 2 : 12-18% */ 1800 {{ 170,205}, { 514,112}}, /* Q == 3 : 18-25% */ 1801 {{ 177,199}, { 539,110}}, /* Q == 4 : 25-32% */ 1802 {{ 197,194}, { 644,107}}, /* Q == 5 : 32-38% */ 1803 {{ 221,192}, { 735,107}}, /* Q == 6 : 38-44% */ 1804 {{ 256,189}, { 881,106}}, /* Q == 7 : 44-50% */ 1805 {{ 359,188}, {1167,109}}, /* Q == 8 : 50-56% */ 1806 {{ 582,187}, {1570,114}}, /* Q == 9 : 56-62% */ 1807 {{ 688,187}, {1712,122}}, /* Q ==10 : 62-69% */ 1808 {{ 825,186}, {1965,136}}, /* Q ==11 : 69-75% */ 1809 {{ 976,185}, {2131,150}}, /* Q ==12 : 75-81% */ 1810 {{1180,186}, {2070,175}}, /* Q ==13 : 81-87% */ 1811 {{1377,185}, {1731,202}}, /* Q ==14 : 87-93% */ 1812 {{1412,185}, {1695,202}}, /* Q ==15 : 93-99% */ 1813 }; 1814 #endif 1815 1816 /** HUF_selectDecoder() : 1817 * Tells which decoder is likely to decode faster, 1818 * based on a set of pre-computed metrics. 1819 * @return : 0==HUF_decompress4X1, 1==HUF_decompress4X2 . 1820 * Assumption : 0 < dstSize <= 128 KB */ 1821 U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize) 1822 { 1823 assert(dstSize > 0); 1824 assert(dstSize <= 128*1024); 1825 #if defined(HUF_FORCE_DECOMPRESS_X1) 1826 (void)dstSize; 1827 (void)cSrcSize; 1828 return 0; 1829 #elif defined(HUF_FORCE_DECOMPRESS_X2) 1830 (void)dstSize; 1831 (void)cSrcSize; 1832 return 1; 1833 #else 1834 /* decoder timing evaluation */ 1835 { U32 const Q = (cSrcSize >= dstSize) ? 15 : (U32)(cSrcSize * 16 / dstSize); /* Q < 16 */ 1836 U32 const D256 = (U32)(dstSize >> 8); 1837 U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256); 1838 U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256); 1839 DTime1 += DTime1 >> 5; /* small advantage to algorithm using less memory, to reduce cache eviction */ 1840 return DTime1 < DTime0; 1841 } 1842 #endif 1843 } 1844 1845 size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, 1846 const void* cSrc, size_t cSrcSize, 1847 void* workSpace, size_t wkspSize, int flags) 1848 { 1849 /* validation checks */ 1850 if (dstSize == 0) return ERROR(dstSize_tooSmall); 1851 if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */ 1852 if (cSrcSize == dstSize) { ZSTD_memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */ 1853 if (cSrcSize == 1) { ZSTD_memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */ 1854 1855 { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); 1856 #if defined(HUF_FORCE_DECOMPRESS_X1) 1857 (void)algoNb; 1858 assert(algoNb == 0); 1859 return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, 1860 cSrcSize, workSpace, wkspSize, flags); 1861 #elif defined(HUF_FORCE_DECOMPRESS_X2) 1862 (void)algoNb; 1863 assert(algoNb == 1); 1864 return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, 1865 cSrcSize, workSpace, wkspSize, flags); 1866 #else 1867 return algoNb ? HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, 1868 cSrcSize, workSpace, wkspSize, flags): 1869 HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, 1870 cSrcSize, workSpace, wkspSize, flags); 1871 #endif 1872 } 1873 } 1874 1875 1876 size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) 1877 { 1878 DTableDesc const dtd = HUF_getDTableDesc(DTable); 1879 #if defined(HUF_FORCE_DECOMPRESS_X1) 1880 (void)dtd; 1881 assert(dtd.tableType == 0); 1882 return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); 1883 #elif defined(HUF_FORCE_DECOMPRESS_X2) 1884 (void)dtd; 1885 assert(dtd.tableType == 1); 1886 return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); 1887 #else 1888 return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : 1889 HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); 1890 #endif 1891 } 1892 1893 #ifndef HUF_FORCE_DECOMPRESS_X2 1894 size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) 1895 { 1896 const BYTE* ip = (const BYTE*) cSrc; 1897 1898 size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); 1899 if (HUF_isError(hSize)) return hSize; 1900 if (hSize >= cSrcSize) return ERROR(srcSize_wrong); 1901 ip += hSize; cSrcSize -= hSize; 1902 1903 return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); 1904 } 1905 #endif 1906 1907 size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) 1908 { 1909 DTableDesc const dtd = HUF_getDTableDesc(DTable); 1910 #if defined(HUF_FORCE_DECOMPRESS_X1) 1911 (void)dtd; 1912 assert(dtd.tableType == 0); 1913 return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); 1914 #elif defined(HUF_FORCE_DECOMPRESS_X2) 1915 (void)dtd; 1916 assert(dtd.tableType == 1); 1917 return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); 1918 #else 1919 return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : 1920 HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); 1921 #endif 1922 } 1923 1924 size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) 1925 { 1926 /* validation checks */ 1927 if (dstSize == 0) return ERROR(dstSize_tooSmall); 1928 if (cSrcSize == 0) return ERROR(corruption_detected); 1929 1930 { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); 1931 #if defined(HUF_FORCE_DECOMPRESS_X1) 1932 (void)algoNb; 1933 assert(algoNb == 0); 1934 return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); 1935 #elif defined(HUF_FORCE_DECOMPRESS_X2) 1936 (void)algoNb; 1937 assert(algoNb == 1); 1938 return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); 1939 #else 1940 return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags) : 1941 HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); 1942 #endif 1943 } 1944 } 1945