1 /* 2 * Copyright (c) Meta Platforms, Inc. and affiliates. 3 * All rights reserved. 4 * 5 * This source code is licensed under both the BSD-style license (found in the 6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found 7 * in the COPYING file in the root directory of this source tree). 8 * You may select, at your option, one of the above-listed licenses. 9 */ 10 11 #include <limits.h> 12 #include <math.h> 13 #include <stddef.h> 14 #include <stdio.h> 15 #include <stdlib.h> 16 #include <string.h> 17 #include <time.h> /* time(), for seed random initialization */ 18 19 #include "util.h" 20 #include "timefn.h" /* UTIL_clockSpanMicro, SEC_TO_MICRO, UTIL_TIME_INITIALIZER */ 21 #include "zstd.h" 22 #include "zstd_internal.h" 23 #include "mem.h" 24 #define ZDICT_STATIC_LINKING_ONLY 25 #include "zdict.h" 26 27 /* Direct access to internal compression functions is required */ 28 #include "compress/zstd_compress.c" /* ZSTD_resetSeqStore, ZSTD_storeSeq, *_TO_OFFBASE, HIST_countFast_wksp, HIST_isError */ 29 #include "decompress/zstd_decompress_block.h" /* ZSTD_decompressBlock_deprecated */ 30 31 #define XXH_STATIC_LINKING_ONLY 32 #include "xxhash.h" /* XXH64 */ 33 34 #if !(defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)) 35 # define inline /* disable */ 36 #endif 37 38 /*-************************************ 39 * DISPLAY Macros 40 **************************************/ 41 #define DISPLAY(...) fprintf(stderr, __VA_ARGS__) 42 #define DISPLAYLEVEL(l, ...) if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); } 43 static U32 g_displayLevel = 2; 44 45 #define DISPLAYUPDATE(...) \ 46 do { \ 47 if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || \ 48 (g_displayLevel >= 4)) { \ 49 g_displayClock = UTIL_getTime(); \ 50 DISPLAY(__VA_ARGS__); \ 51 if (g_displayLevel >= 4) fflush(stderr); \ 52 } \ 53 } while (0) 54 55 static const U64 g_refreshRate = SEC_TO_MICRO / 6; 56 static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER; 57 58 #define CHECKERR(code) \ 59 do { \ 60 if (ZSTD_isError(code)) { \ 61 DISPLAY("Error occurred while generating data: %s\n", \ 62 ZSTD_getErrorName(code)); \ 63 exit(1); \ 64 } \ 65 } while (0) 66 67 68 /*-******************************************************* 69 * Random function 70 *********************************************************/ 71 static U32 RAND(U32* src) 72 { 73 #define RAND_rotl32(x,r) ((x << r) | (x >> (32 - r))) 74 static const U32 prime1 = 2654435761U; 75 static const U32 prime2 = 2246822519U; 76 U32 rand32 = *src; 77 rand32 *= prime1; 78 rand32 += prime2; 79 rand32 = RAND_rotl32(rand32, 13); 80 *src = rand32; 81 return RAND_rotl32(rand32, 27); 82 #undef RAND_rotl32 83 } 84 85 #define DISTSIZE (8192) 86 87 /* Write `size` bytes into `ptr`, all of which are less than or equal to `maxSymb` */ 88 static void RAND_bufferMaxSymb(U32* seed, void* ptr, size_t size, int maxSymb) 89 { 90 size_t i; 91 BYTE* op = ptr; 92 93 for (i = 0; i < size; i++) { 94 op[i] = (BYTE) (RAND(seed) % (maxSymb + 1)); 95 } 96 } 97 98 /* Write `size` random bytes into `ptr` */ 99 static void RAND_buffer(U32* seed, void* ptr, size_t size) 100 { 101 size_t i; 102 BYTE* op = ptr; 103 104 for (i = 0; i + 4 <= size; i += 4) { 105 MEM_writeLE32(op + i, RAND(seed)); 106 } 107 for (; i < size; i++) { 108 op[i] = RAND(seed) & 0xff; 109 } 110 } 111 112 /* Write `size` bytes into `ptr` following the distribution `dist` */ 113 static void RAND_bufferDist(U32* seed, BYTE* dist, void* ptr, size_t size) 114 { 115 size_t i; 116 
BYTE* op = ptr; 117 118 for (i = 0; i < size; i++) { 119 op[i] = dist[RAND(seed) % DISTSIZE]; 120 } 121 } 122 123 /* Generate a random distribution where the frequency of each symbol follows a 124 * geometric distribution defined by `weight` 125 * `dist` should have size at least `DISTSIZE` */ 126 static void RAND_genDist(U32* seed, BYTE* dist, double weight) 127 { 128 size_t i = 0; 129 size_t statesLeft = DISTSIZE; 130 BYTE symb = (BYTE) (RAND(seed) % 256); 131 BYTE step = (BYTE) ((RAND(seed) % 256) | 1); /* force it to be odd so it's relatively prime to 256 */ 132 133 while (i < DISTSIZE) { 134 size_t states = ((size_t)(weight * (double)statesLeft)) + 1; 135 size_t j; 136 for (j = 0; j < states && i < DISTSIZE; j++, i++) { 137 dist[i] = symb; 138 } 139 140 symb += step; 141 statesLeft -= states; 142 } 143 } 144 145 /* Generates a random number in the range [min, max) */ 146 static inline U32 RAND_range(U32* seed, U32 min, U32 max) 147 { 148 return (RAND(seed) % (max-min)) + min; 149 } 150 151 #define ROUND(x) ((U32)(x + 0.5)) 152 153 /* Generates a random number in an exponential distribution with mean `mean` */ 154 static double RAND_exp(U32* seed, double mean) 155 { 156 double const u = RAND(seed) / (double) UINT_MAX; 157 return log(1-u) * (-mean); 158 } 159 160 /*-******************************************************* 161 * Constants and Structs 162 *********************************************************/ 163 const char* BLOCK_TYPES[] = {"raw", "rle", "compressed"}; 164 165 #define MAX_DECOMPRESSED_SIZE_LOG 20 166 #define MAX_DECOMPRESSED_SIZE (1ULL << MAX_DECOMPRESSED_SIZE_LOG) 167 168 #define MAX_WINDOW_LOG 22 /* Recommended support is 8MB, so limit to 4MB + mantissa */ 169 170 #define MIN_SEQ_LEN (3) 171 #define MAX_NB_SEQ ((ZSTD_BLOCKSIZE_MAX + MIN_SEQ_LEN - 1) / MIN_SEQ_LEN) 172 173 #ifndef MAX_PATH 174 #ifdef PATH_MAX 175 #define MAX_PATH PATH_MAX 176 #else 177 #define MAX_PATH 256 178 #endif 179 #endif 180 181 BYTE CONTENT_BUFFER[MAX_DECOMPRESSED_SIZE]; 182 BYTE FRAME_BUFFER[MAX_DECOMPRESSED_SIZE * 2]; 183 BYTE LITERAL_BUFFER[ZSTD_BLOCKSIZE_MAX]; 184 185 seqDef SEQUENCE_BUFFER[MAX_NB_SEQ]; 186 BYTE SEQUENCE_LITERAL_BUFFER[ZSTD_BLOCKSIZE_MAX]; /* storeSeq expects a place to copy literals to */ 187 BYTE SEQUENCE_LLCODE[ZSTD_BLOCKSIZE_MAX]; 188 BYTE SEQUENCE_MLCODE[ZSTD_BLOCKSIZE_MAX]; 189 BYTE SEQUENCE_OFCODE[ZSTD_BLOCKSIZE_MAX]; 190 191 U64 WKSP[HUF_WORKSPACE_SIZE_U64]; 192 193 typedef struct { 194 size_t contentSize; /* 0 means unknown (unless contentSize == windowSize == 0) */ 195 unsigned windowSize; /* contentSize >= windowSize means single segment */ 196 } frameHeader_t; 197 198 /* For repeat modes */ 199 typedef struct { 200 U32 rep[ZSTD_REP_NUM]; 201 202 int hufInit; 203 /* the distribution used in the previous block for repeat mode */ 204 BYTE hufDist[DISTSIZE]; 205 HUF_CElt hufTable [HUF_CTABLE_SIZE_ST(255)]; 206 207 int fseInit; 208 FSE_CTable offcodeCTable [FSE_CTABLE_SIZE_U32(OffFSELog, MaxOff)]; 209 FSE_CTable matchlengthCTable[FSE_CTABLE_SIZE_U32(MLFSELog, MaxML)]; 210 FSE_CTable litlengthCTable [FSE_CTABLE_SIZE_U32(LLFSELog, MaxLL)]; 211 212 /* Symbols that were present in the previous distribution, for use with 213 * set_repeat */ 214 BYTE litlengthSymbolSet[36]; 215 BYTE offsetSymbolSet[29]; 216 BYTE matchlengthSymbolSet[53]; 217 } cblockStats_t; 218 219 typedef struct { 220 void* data; 221 void* dataStart; 222 void* dataEnd; 223 224 void* src; 225 void* srcStart; 226 void* srcEnd; 227 228 frameHeader_t header; 229 230 cblockStats_t stats; 231 cblockStats_t 
oldStats; /* so they can be rolled back if uncompressible */ 232 } frame_t; 233 234 typedef struct { 235 int useDict; 236 U32 dictID; 237 size_t dictContentSize; 238 BYTE* dictContent; 239 } dictInfo; 240 241 typedef enum { 242 gt_frame = 0, /* generate frames */ 243 gt_block, /* generate compressed blocks without block/frame headers */ 244 } genType_e; 245 246 #ifndef MIN 247 #define MIN(a, b) ((a) < (b) ? (a) : (b)) 248 #endif 249 250 /*-******************************************************* 251 * Global variables (set from command line) 252 *********************************************************/ 253 U32 g_maxDecompressedSizeLog = MAX_DECOMPRESSED_SIZE_LOG; /* <= 20 */ 254 U32 g_maxBlockSize = ZSTD_BLOCKSIZE_MAX; /* <= 128 KB */ 255 256 /*-******************************************************* 257 * Generator Functions 258 *********************************************************/ 259 260 struct { 261 int contentSize; /* force the content size to be present */ 262 } opts; /* advanced options on generation */ 263 264 /* Generate and write a random frame header */ 265 static void writeFrameHeader(U32* seed, frame_t* frame, dictInfo info) 266 { 267 BYTE* const op = frame->data; 268 size_t pos = 0; 269 frameHeader_t fh; 270 271 BYTE windowByte = 0; 272 273 int singleSegment = 0; 274 int contentSizeFlag = 0; 275 int fcsCode = 0; 276 277 memset(&fh, 0, sizeof(fh)); 278 279 /* generate window size */ 280 { 281 /* Follow window algorithm from specification */ 282 int const exponent = RAND(seed) % (MAX_WINDOW_LOG - 10); 283 int const mantissa = RAND(seed) % 8; 284 windowByte = (BYTE) ((exponent << 3) | mantissa); 285 fh.windowSize = (1U << (exponent + 10)); 286 fh.windowSize += fh.windowSize / 8 * mantissa; 287 } 288 289 { 290 /* Generate random content size */ 291 size_t highBit; 292 if (RAND(seed) & 7 && g_maxDecompressedSizeLog > 7) { 293 /* do content of at least 128 bytes */ 294 highBit = 1ULL << RAND_range(seed, 7, g_maxDecompressedSizeLog); 295 } else if (RAND(seed) & 3) { 296 /* do small content */ 297 highBit = 1ULL << RAND_range(seed, 0, MIN(7, 1U << g_maxDecompressedSizeLog)); 298 } else { 299 /* 0 size frame */ 300 highBit = 0; 301 } 302 fh.contentSize = highBit ? highBit + (RAND(seed) % highBit) : 0; 303 304 /* provide size sometimes */ 305 contentSizeFlag = opts.contentSize | (RAND(seed) & 1); 306 307 if (contentSizeFlag && (fh.contentSize == 0 || !(RAND(seed) & 7))) { 308 /* do single segment sometimes */ 309 fh.windowSize = (U32) fh.contentSize; 310 singleSegment = 1; 311 } 312 } 313 314 if (contentSizeFlag) { 315 /* Determine how large fcs field has to be */ 316 int minFcsCode = (fh.contentSize >= 256) + 317 (fh.contentSize >= 65536 + 256) + 318 (fh.contentSize > 0xFFFFFFFFU); 319 if (!singleSegment && !minFcsCode) { 320 minFcsCode = 1; 321 } 322 fcsCode = minFcsCode + (RAND(seed) % (4 - minFcsCode)); 323 if (fcsCode == 1 && fh.contentSize < 256) fcsCode++; 324 } 325 326 /* write out the header */ 327 MEM_writeLE32(op + pos, ZSTD_MAGICNUMBER); 328 pos += 4; 329 330 { 331 /* 332 * fcsCode: 2-bit flag specifying how many bytes used to represent Frame_Content_Size (bits 7-6) 333 * singleSegment: 1-bit flag describing if data must be regenerated within a single continuous memory segment. 
(bit 5) 334 * contentChecksumFlag: 1-bit flag that is set if frame includes checksum at the end -- set to 1 below (bit 2) 335 * dictBits: 2-bit flag describing how many bytes Dictionary_ID uses -- set to 3 (bits 1-0) 336 * For more information: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#frame_header 337 */ 338 int const dictBits = info.useDict ? 3 : 0; 339 BYTE const frameHeaderDescriptor = 340 (BYTE) ((fcsCode << 6) | (singleSegment << 5) | (1 << 2) | dictBits); 341 op[pos++] = frameHeaderDescriptor; 342 } 343 344 if (!singleSegment) { 345 op[pos++] = windowByte; 346 } 347 if (info.useDict) { 348 MEM_writeLE32(op + pos, (U32) info.dictID); 349 pos += 4; 350 } 351 if (contentSizeFlag) { 352 switch (fcsCode) { 353 default: /* Impossible */ 354 case 0: op[pos++] = (BYTE) fh.contentSize; break; 355 case 1: MEM_writeLE16(op + pos, (U16) (fh.contentSize - 256)); pos += 2; break; 356 case 2: MEM_writeLE32(op + pos, (U32) fh.contentSize); pos += 4; break; 357 case 3: MEM_writeLE64(op + pos, (U64) fh.contentSize); pos += 8; break; 358 } 359 } 360 361 DISPLAYLEVEL(3, " frame content size:\t%u\n", (unsigned)fh.contentSize); 362 DISPLAYLEVEL(3, " frame window size:\t%u\n", fh.windowSize); 363 DISPLAYLEVEL(3, " content size flag:\t%d\n", contentSizeFlag); 364 DISPLAYLEVEL(3, " single segment flag:\t%d\n", singleSegment); 365 366 frame->data = op + pos; 367 frame->header = fh; 368 } 369 370 /* Write a literal block in either raw or RLE form, return the literals size */ 371 static size_t writeLiteralsBlockSimple(U32* seed, frame_t* frame, size_t contentSize) 372 { 373 BYTE* op = (BYTE*)frame->data; 374 int const type = RAND(seed) % 2; 375 int const sizeFormatDesc = RAND(seed) % 8; 376 size_t litSize; 377 size_t maxLitSize = MIN(contentSize, g_maxBlockSize); 378 379 if (sizeFormatDesc == 0) { 380 /* Size_FormatDesc = ?0 */ 381 maxLitSize = MIN(maxLitSize, 31); 382 } else if (sizeFormatDesc <= 4) { 383 /* Size_FormatDesc = 01 */ 384 maxLitSize = MIN(maxLitSize, 4095); 385 } else { 386 /* Size_Format = 11 */ 387 maxLitSize = MIN(maxLitSize, 1048575); 388 } 389 390 litSize = RAND(seed) % (maxLitSize + 1); 391 if (frame->src == frame->srcStart && litSize == 0) { 392 litSize = 1; /* no empty literals if there's nothing preceding this block */ 393 } 394 if (litSize + 3 > contentSize) { 395 litSize = contentSize; /* no matches shorter than 3 are allowed */ 396 } 397 /* use smallest size format that fits */ 398 if (litSize < 32) { 399 op[0] = (type | (0 << 2) | (litSize << 3)) & 0xff; 400 op += 1; 401 } else if (litSize < 4096) { 402 op[0] = (type | (1 << 2) | (litSize << 4)) & 0xff; 403 op[1] = (litSize >> 4) & 0xff; 404 op += 2; 405 } else { 406 op[0] = (type | (3 << 2) | (litSize << 4)) & 0xff; 407 op[1] = (litSize >> 4) & 0xff; 408 op[2] = (litSize >> 12) & 0xff; 409 op += 3; 410 } 411 412 if (type == 0) { 413 /* Raw literals */ 414 DISPLAYLEVEL(4, " raw literals\n"); 415 416 RAND_buffer(seed, LITERAL_BUFFER, litSize); 417 memcpy(op, LITERAL_BUFFER, litSize); 418 op += litSize; 419 } else { 420 /* RLE literals */ 421 BYTE const symb = (BYTE) (RAND(seed) % 256); 422 423 DISPLAYLEVEL(4, " rle literals: 0x%02x\n", (unsigned)symb); 424 425 memset(LITERAL_BUFFER, symb, litSize); 426 op[0] = symb; 427 op++; 428 } 429 430 frame->data = op; 431 432 return litSize; 433 } 434 435 /* Generate a Huffman header for the given source */ 436 static size_t writeHufHeader(U32* seed, HUF_CElt* hufTable, void* dst, size_t dstSize, 437 const void* src, size_t srcSize) 438 { 439 BYTE* const ostart = 
(BYTE*)dst; 440 BYTE* op = ostart; 441 442 unsigned huffLog = 11; 443 unsigned maxSymbolValue = 255; 444 445 unsigned count[HUF_SYMBOLVALUE_MAX+1]; 446 447 /* Scan input and build symbol stats */ 448 { size_t const largest = HIST_count_wksp (count, &maxSymbolValue, (const BYTE*)src, srcSize, WKSP, sizeof(WKSP)); 449 assert(!HIST_isError(largest)); 450 if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 0; } /* single symbol, rle */ 451 if (largest <= (srcSize >> 7)+1) return 0; /* Fast heuristic : not compressible enough */ 452 } 453 454 /* Build Huffman Tree */ 455 /* Max Huffman log is 11, min is highbit(maxSymbolValue)+1 */ 456 huffLog = RAND_range(seed, ZSTD_highbit32(maxSymbolValue)+1, huffLog+1); 457 DISPLAYLEVEL(6, " huffman log: %u\n", huffLog); 458 { size_t const maxBits = HUF_buildCTable_wksp (hufTable, count, maxSymbolValue, huffLog, WKSP, sizeof(WKSP)); 459 CHECKERR(maxBits); 460 huffLog = (U32)maxBits; 461 } 462 463 /* Write table description header */ 464 { size_t const hSize = HUF_writeCTable_wksp (op, dstSize, hufTable, maxSymbolValue, huffLog, WKSP, sizeof(WKSP)); 465 if (hSize + 12 >= srcSize) return 0; /* not useful to try compression */ 466 op += hSize; 467 } 468 469 return op - ostart; 470 } 471 472 /* Write a Huffman coded literals block and return the literals size */ 473 static size_t writeLiteralsBlockCompressed(U32* seed, frame_t* frame, size_t contentSize) 474 { 475 BYTE* origop = (BYTE*)frame->data; 476 BYTE* opend = (BYTE*)frame->dataEnd; 477 BYTE* op; 478 BYTE* const ostart = origop; 479 int const sizeFormat = RAND(seed) % 4; 480 size_t litSize; 481 size_t hufHeaderSize = 0; 482 size_t compressedSize = 0; 483 size_t maxLitSize = MIN(contentSize-3, g_maxBlockSize); 484 485 symbolEncodingType_e hType; 486 487 if (contentSize < 64) { 488 /* make sure we get reasonably-sized literals for compression */ 489 return ERROR(GENERIC); 490 } 491 492 DISPLAYLEVEL(4, " compressed literals\n"); 493 494 switch (sizeFormat) { 495 case 0: /* fall through, size is the same as case 1 */ 496 case 1: 497 maxLitSize = MIN(maxLitSize, 1023); 498 origop += 3; 499 break; 500 case 2: 501 maxLitSize = MIN(maxLitSize, 16383); 502 origop += 4; 503 break; 504 case 3: 505 maxLitSize = MIN(maxLitSize, 262143); 506 origop += 5; 507 break; 508 default:; /* impossible */ 509 } 510 511 do { 512 op = origop; 513 do { 514 litSize = RAND(seed) % (maxLitSize + 1); 515 } while (litSize < 32); /* avoid small literal sizes */ 516 if (litSize + 3 > contentSize) { 517 litSize = contentSize; /* no matches shorter than 3 are allowed */ 518 } 519 520 /* most of the time generate a new distribution */ 521 if ((RAND(seed) & 3) || !frame->stats.hufInit) { 522 do { 523 if (RAND(seed) & 3) { 524 /* add 10 to ensure some compressibility */ 525 double const weight = ((RAND(seed) % 90) + 10) / 100.0; 526 527 DISPLAYLEVEL(5, " distribution weight: %d%%\n", 528 (int)(weight * 100)); 529 530 RAND_genDist(seed, frame->stats.hufDist, weight); 531 } else { 532 /* sometimes do restricted range literals to force 533 * non-huffman headers */ 534 DISPLAYLEVEL(5, " small range literals\n"); 535 RAND_bufferMaxSymb(seed, frame->stats.hufDist, DISTSIZE, 536 15); 537 } 538 RAND_bufferDist(seed, frame->stats.hufDist, LITERAL_BUFFER, 539 litSize); 540 541 /* generate the header from the distribution instead of the 542 * actual data to avoid bugs with symbols that were in the 543 * distribution but never showed up in the output */ 544 hufHeaderSize = writeHufHeader( 545 seed, frame->stats.hufTable, op, opend - op, 546 
frame->stats.hufDist, DISTSIZE); 547 CHECKERR(hufHeaderSize); 548 /* repeat until a valid header is written */ 549 } while (hufHeaderSize == 0); 550 op += hufHeaderSize; 551 hType = set_compressed; 552 553 frame->stats.hufInit = 1; 554 } else { 555 /* repeat the distribution/table from last time */ 556 DISPLAYLEVEL(5, " huffman repeat stats\n"); 557 RAND_bufferDist(seed, frame->stats.hufDist, LITERAL_BUFFER, 558 litSize); 559 hufHeaderSize = 0; 560 hType = set_repeat; 561 } 562 563 do { 564 compressedSize = 565 sizeFormat == 0 566 ? HUF_compress1X_usingCTable( 567 op, opend - op, LITERAL_BUFFER, litSize, 568 frame->stats.hufTable, /* flags */ 0) 569 : HUF_compress4X_usingCTable( 570 op, opend - op, LITERAL_BUFFER, litSize, 571 frame->stats.hufTable, /* flags */ 0); 572 CHECKERR(compressedSize); 573 /* this only occurs when it could not compress or similar */ 574 } while (compressedSize <= 0); 575 576 op += compressedSize; 577 578 compressedSize += hufHeaderSize; 579 DISPLAYLEVEL(5, " regenerated size: %u\n", (unsigned)litSize); 580 DISPLAYLEVEL(5, " compressed size: %u\n", (unsigned)compressedSize); 581 if (compressedSize >= litSize) { 582 DISPLAYLEVEL(5, " trying again\n"); 583 /* if we have to try again, reset the stats so we don't accidentally 584 * try to repeat a distribution we just made */ 585 frame->stats = frame->oldStats; 586 } else { 587 break; 588 } 589 } while (1); 590 591 /* write header */ 592 switch (sizeFormat) { 593 case 0: /* fall through, size is the same as case 1 */ 594 case 1: { 595 U32 const header = hType | (sizeFormat << 2) | ((U32)litSize << 4) | 596 ((U32)compressedSize << 14); 597 MEM_writeLE24(ostart, header); 598 break; 599 } 600 case 2: { 601 U32 const header = hType | (sizeFormat << 2) | ((U32)litSize << 4) | 602 ((U32)compressedSize << 18); 603 MEM_writeLE32(ostart, header); 604 break; 605 } 606 case 3: { 607 U32 const header = hType | (sizeFormat << 2) | ((U32)litSize << 4) | 608 ((U32)compressedSize << 22); 609 MEM_writeLE32(ostart, header); 610 ostart[4] = (BYTE)(compressedSize >> 10); 611 break; 612 } 613 default:; /* impossible */ 614 } 615 616 frame->data = op; 617 return litSize; 618 } 619 620 static size_t writeLiteralsBlock(U32* seed, frame_t* frame, size_t contentSize) 621 { 622 /* only do compressed for larger segments to avoid compressibility issues */ 623 if (RAND(seed) & 7 && contentSize >= 64) { 624 return writeLiteralsBlockCompressed(seed, frame, contentSize); 625 } else { 626 return writeLiteralsBlockSimple(seed, frame, contentSize); 627 } 628 } 629 630 static inline void initSeqStore(seqStore_t *seqStore) { 631 seqStore->maxNbSeq = MAX_NB_SEQ; 632 seqStore->maxNbLit = ZSTD_BLOCKSIZE_MAX; 633 seqStore->sequencesStart = SEQUENCE_BUFFER; 634 seqStore->litStart = SEQUENCE_LITERAL_BUFFER; 635 seqStore->llCode = SEQUENCE_LLCODE; 636 seqStore->mlCode = SEQUENCE_MLCODE; 637 seqStore->ofCode = SEQUENCE_OFCODE; 638 639 ZSTD_resetSeqStore(seqStore); 640 } 641 642 /* Randomly generate sequence commands */ 643 static U32 644 generateSequences(U32* seed, frame_t* frame, seqStore_t* seqStore, 645 size_t contentSize, size_t literalsSize, dictInfo info) 646 { 647 /* The total length of all the matches */ 648 size_t const remainingMatch = contentSize - literalsSize; 649 size_t excessMatch = 0; 650 U32 numSequences = 0; 651 U32 i; 652 653 const BYTE* literals = LITERAL_BUFFER; 654 BYTE* srcPtr = frame->src; 655 656 if (literalsSize != contentSize) { 657 /* each match must be at least MIN_SEQ_LEN, so this is the maximum 658 * number of sequences we can have 
*/ 659 U32 const maxSequences = (U32)remainingMatch / MIN_SEQ_LEN; 660 numSequences = (RAND(seed) % maxSequences) + 1; 661 662 /* the extra match lengths we have to allocate to each sequence */ 663 excessMatch = remainingMatch - numSequences * MIN_SEQ_LEN; 664 } 665 666 DISPLAYLEVEL(5, " total match lengths: %u\n", (unsigned)remainingMatch); 667 for (i = 0; i < numSequences; i++) { 668 /* Generate match and literal lengths by exponential distribution to 669 * ensure nice numbers */ 670 U32 matchLen = 671 MIN_SEQ_LEN + 672 ROUND(RAND_exp(seed, (double)excessMatch / (double)(numSequences - i))); 673 U32 literalLen = 674 (RAND(seed) & 7) 675 ? ROUND(RAND_exp(seed, 676 (double)literalsSize / 677 (double)(numSequences - i))) 678 : 0; 679 /* actual offset, code to send, and point to copy up to when shifting 680 * codes in the repeat offsets history */ 681 U32 offset, offBase, repIndex; 682 683 /* bounds checks */ 684 matchLen = (U32) MIN(matchLen, excessMatch + MIN_SEQ_LEN); 685 literalLen = MIN(literalLen, (U32) literalsSize); 686 if (i == 0 && srcPtr == frame->srcStart && literalLen == 0) literalLen = 1; 687 if (i + 1 == numSequences) matchLen = MIN_SEQ_LEN + (U32) excessMatch; 688 689 memcpy(srcPtr, literals, literalLen); 690 srcPtr += literalLen; 691 do { 692 if (RAND(seed) & 7) { 693 /* do a normal offset */ 694 U32 const dataDecompressed = (U32)((BYTE*)srcPtr-(BYTE*)frame->srcStart); 695 offset = (RAND(seed) % 696 MIN(frame->header.windowSize, 697 (size_t)((BYTE*)srcPtr - (BYTE*)frame->srcStart))) + 698 1; 699 if (info.useDict && (RAND(seed) & 1) && i + 1 != numSequences && dataDecompressed < frame->header.windowSize) { 700 /* need to occasionally generate offsets that go past the start */ 701 /* including i+1 != numSequences because the last sequences has to adhere to predetermined contentSize */ 702 U32 lenPastStart = (RAND(seed) % info.dictContentSize) + 1; 703 offset = (U32)((BYTE*)srcPtr - (BYTE*)frame->srcStart)+lenPastStart; 704 if (offset > frame->header.windowSize) { 705 if (lenPastStart < MIN_SEQ_LEN) { 706 /* when offset > windowSize, matchLen bound by end of dictionary (lenPastStart) */ 707 /* this also means that lenPastStart must be greater than MIN_SEQ_LEN */ 708 /* make sure lenPastStart does not go past dictionary start though */ 709 lenPastStart = MIN(lenPastStart+MIN_SEQ_LEN, (U32)info.dictContentSize); 710 offset = (U32)((BYTE*)srcPtr - (BYTE*)frame->srcStart) + lenPastStart; 711 } 712 { U32 const matchLenBound = MIN(frame->header.windowSize, lenPastStart); 713 matchLen = MIN(matchLen, matchLenBound); 714 } 715 } 716 } 717 offBase = OFFSET_TO_OFFBASE(offset); 718 repIndex = 2; 719 } else { 720 /* do a repeat offset */ 721 U32 const randomRepIndex = RAND(seed) % 3; 722 offBase = REPCODE_TO_OFFBASE(randomRepIndex + 1); /* expects values between 1 & 3 */ 723 if (literalLen > 0) { 724 offset = frame->stats.rep[randomRepIndex]; 725 repIndex = randomRepIndex; 726 } else { 727 /* special case : literalLen == 0 */ 728 offset = randomRepIndex == 2 ? 
frame->stats.rep[0] - 1 729 : frame->stats.rep[randomRepIndex + 1]; 730 repIndex = MIN(2, randomRepIndex + 1); 731 } 732 } 733 } while (((!info.useDict) && (offset > (size_t)((BYTE*)srcPtr - (BYTE*)frame->srcStart))) || offset == 0); 734 735 { BYTE* const dictEnd = ZSTD_maybeNullPtrAdd(info.dictContent, info.dictContentSize); 736 size_t j; 737 for (j = 0; j < matchLen; j++) { 738 if ((U32)((BYTE*)srcPtr - (BYTE*)frame->srcStart) < offset) { 739 /* copy from dictionary instead of literals */ 740 size_t const dictOffset = offset - (srcPtr - (BYTE*)frame->srcStart); 741 *srcPtr = *(dictEnd - dictOffset); 742 } 743 else { 744 *srcPtr = *(srcPtr-offset); 745 } 746 srcPtr++; 747 } } 748 749 { int r; 750 for (r = repIndex; r > 0; r--) { 751 frame->stats.rep[r] = frame->stats.rep[r - 1]; 752 } 753 frame->stats.rep[0] = offset; 754 } 755 756 DISPLAYLEVEL(6, " LL: %5u OF: %5u ML: %5u", 757 (unsigned)literalLen, (unsigned)offset, (unsigned)matchLen); 758 DISPLAYLEVEL(7, " srcPos: %8u seqNb: %3u", 759 (unsigned)((BYTE*)srcPtr - (BYTE*)frame->srcStart), (unsigned)i); 760 DISPLAYLEVEL(6, "\n"); 761 if (OFFBASE_IS_REPCODE(offBase)) { /* expects sumtype numeric representation of ZSTD_storeSeq() */ 762 DISPLAYLEVEL(7, " repeat offset: %d\n", (int)repIndex); 763 } 764 /* use libzstd sequence handling */ 765 ZSTD_storeSeq(seqStore, literalLen, literals, literals + literalLen, 766 offBase, matchLen); 767 768 literalsSize -= literalLen; 769 excessMatch -= (matchLen - MIN_SEQ_LEN); 770 literals += literalLen; 771 } 772 773 memcpy(srcPtr, literals, literalsSize); 774 srcPtr += literalsSize; 775 DISPLAYLEVEL(6, " excess literals: %5u ", (unsigned)literalsSize); 776 DISPLAYLEVEL(7, "srcPos: %8u ", (unsigned)((BYTE*)srcPtr - (BYTE*)frame->srcStart)); 777 DISPLAYLEVEL(6, "\n"); 778 779 return numSequences; 780 } 781 782 static void initSymbolSet(const BYTE* symbols, size_t len, BYTE* set, BYTE maxSymbolValue) 783 { 784 size_t i; 785 786 memset(set, 0, (size_t)maxSymbolValue+1); 787 788 for (i = 0; i < len; i++) { 789 set[symbols[i]] = 1; 790 } 791 } 792 793 static int isSymbolSubset(const BYTE* symbols, size_t len, const BYTE* set, BYTE maxSymbolValue) 794 { 795 size_t i; 796 797 for (i = 0; i < len; i++) { 798 if (symbols[i] > maxSymbolValue || !set[symbols[i]]) { 799 return 0; 800 } 801 } 802 return 1; 803 } 804 805 static size_t writeSequences(U32* seed, frame_t* frame, seqStore_t* seqStorePtr, 806 size_t nbSeq) 807 { 808 /* This code is mostly copied from ZSTD_compressSequences in zstd_compress.c */ 809 unsigned count[MaxSeq+1]; 810 S16 norm[MaxSeq+1]; 811 FSE_CTable* CTable_LitLength = frame->stats.litlengthCTable; 812 FSE_CTable* CTable_OffsetBits = frame->stats.offcodeCTable; 813 FSE_CTable* CTable_MatchLength = frame->stats.matchlengthCTable; 814 U32 LLtype, Offtype, MLtype; /* compressed, raw or rle */ 815 const seqDef* const sequences = seqStorePtr->sequencesStart; 816 const BYTE* const ofCodeTable = seqStorePtr->ofCode; 817 const BYTE* const llCodeTable = seqStorePtr->llCode; 818 const BYTE* const mlCodeTable = seqStorePtr->mlCode; 819 BYTE* const oend = (BYTE*)frame->dataEnd; 820 BYTE* op = (BYTE*)frame->data; 821 BYTE* seqHead; 822 BYTE scratchBuffer[FSE_BUILD_CTABLE_WORKSPACE_SIZE(MaxSeq, MaxFSELog)]; 823 824 /* literals compressing block removed so that can be done separately */ 825 826 /* Sequences Header */ 827 if ((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead */) return ERROR(dstSize_tooSmall); 828 if (nbSeq < 128) *op++ = (BYTE)nbSeq; 829 else if (nbSeq < LONGNBSEQ) op[0] = (BYTE)((nbSeq>>8) + 
0x80), op[1] = (BYTE)nbSeq, op+=2; 830 else op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3; 831 832 if (nbSeq==0) { 833 frame->data = op; 834 return 0; 835 } 836 837 /* seqHead : flags for FSE encoding type */ 838 seqHead = op++; 839 840 /* convert length/distances into codes */ 841 ZSTD_seqToCodes(seqStorePtr); 842 843 /* CTable for Literal Lengths */ 844 { unsigned max = MaxLL; 845 size_t const mostFrequent = HIST_countFast_wksp(count, &max, llCodeTable, nbSeq, WKSP, sizeof(WKSP)); /* cannot fail */ 846 assert(!HIST_isError(mostFrequent)); 847 if (frame->stats.fseInit && !(RAND(seed) & 3) && 848 isSymbolSubset(llCodeTable, nbSeq, 849 frame->stats.litlengthSymbolSet, 35)) { 850 /* maybe do repeat mode if we're allowed to */ 851 LLtype = set_repeat; 852 } else if (mostFrequent == nbSeq) { 853 /* do RLE if we have the chance */ 854 *op++ = llCodeTable[0]; 855 FSE_buildCTable_rle(CTable_LitLength, (BYTE)max); 856 LLtype = set_rle; 857 } else if (!(RAND(seed) & 3)) { 858 /* maybe use the default distribution */ 859 CHECKERR(FSE_buildCTable_wksp(CTable_LitLength, LL_defaultNorm, MaxLL, LL_defaultNormLog, scratchBuffer, sizeof(scratchBuffer))); 860 LLtype = set_basic; 861 } else { 862 /* fall back on a full table */ 863 size_t nbSeq_1 = nbSeq; 864 const U32 tableLog = FSE_optimalTableLog(LLFSELog, nbSeq, max); 865 if (count[llCodeTable[nbSeq-1]]>1) { count[llCodeTable[nbSeq-1]]--; nbSeq_1--; } 866 FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max, nbSeq >= 2048); 867 { size_t const NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog); /* overflow protected */ 868 if (FSE_isError(NCountSize)) return ERROR(GENERIC); 869 op += NCountSize; } 870 CHECKERR(FSE_buildCTable_wksp(CTable_LitLength, norm, max, tableLog, scratchBuffer, sizeof(scratchBuffer))); 871 LLtype = set_compressed; 872 } } 873 874 /* CTable for Offsets */ 875 /* see Literal Lengths for descriptions of mode choices */ 876 { unsigned max = MaxOff; 877 size_t const mostFrequent = HIST_countFast_wksp(count, &max, ofCodeTable, nbSeq, WKSP, sizeof(WKSP)); /* cannot fail */ 878 assert(!HIST_isError(mostFrequent)); 879 if (frame->stats.fseInit && !(RAND(seed) & 3) && 880 isSymbolSubset(ofCodeTable, nbSeq, 881 frame->stats.offsetSymbolSet, 28)) { 882 Offtype = set_repeat; 883 } else if (mostFrequent == nbSeq) { 884 *op++ = ofCodeTable[0]; 885 FSE_buildCTable_rle(CTable_OffsetBits, (BYTE)max); 886 Offtype = set_rle; 887 } else if (!(RAND(seed) & 3)) { 888 FSE_buildCTable_wksp(CTable_OffsetBits, OF_defaultNorm, DefaultMaxOff, OF_defaultNormLog, scratchBuffer, sizeof(scratchBuffer)); 889 Offtype = set_basic; 890 } else { 891 size_t nbSeq_1 = nbSeq; 892 const U32 tableLog = FSE_optimalTableLog(OffFSELog, nbSeq, max); 893 if (count[ofCodeTable[nbSeq-1]]>1) { count[ofCodeTable[nbSeq-1]]--; nbSeq_1--; } 894 FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max, nbSeq >= 2048); 895 { size_t const NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog); /* overflow protected */ 896 if (FSE_isError(NCountSize)) return ERROR(GENERIC); 897 op += NCountSize; } 898 FSE_buildCTable_wksp(CTable_OffsetBits, norm, max, tableLog, scratchBuffer, sizeof(scratchBuffer)); 899 Offtype = set_compressed; 900 } } 901 902 /* CTable for MatchLengths */ 903 /* see Literal Lengths for descriptions of mode choices */ 904 { unsigned max = MaxML; 905 size_t const mostFrequent = HIST_countFast_wksp(count, &max, mlCodeTable, nbSeq, WKSP, sizeof(WKSP)); /* cannot fail */ 906 assert(!HIST_isError(mostFrequent)); 907 if 
(frame->stats.fseInit && !(RAND(seed) & 3) && 908 isSymbolSubset(mlCodeTable, nbSeq, 909 frame->stats.matchlengthSymbolSet, 52)) { 910 MLtype = set_repeat; 911 } else if (mostFrequent == nbSeq) { 912 *op++ = *mlCodeTable; 913 FSE_buildCTable_rle(CTable_MatchLength, (BYTE)max); 914 MLtype = set_rle; 915 } else if (!(RAND(seed) & 3)) { 916 /* sometimes do default distribution */ 917 FSE_buildCTable_wksp(CTable_MatchLength, ML_defaultNorm, MaxML, ML_defaultNormLog, scratchBuffer, sizeof(scratchBuffer)); 918 MLtype = set_basic; 919 } else { 920 /* fall back on table */ 921 size_t nbSeq_1 = nbSeq; 922 const U32 tableLog = FSE_optimalTableLog(MLFSELog, nbSeq, max); 923 if (count[mlCodeTable[nbSeq-1]]>1) { count[mlCodeTable[nbSeq-1]]--; nbSeq_1--; } 924 FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max, nbSeq >= 2048); 925 { size_t const NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog); /* overflow protected */ 926 if (FSE_isError(NCountSize)) return ERROR(GENERIC); 927 op += NCountSize; } 928 FSE_buildCTable_wksp(CTable_MatchLength, norm, max, tableLog, scratchBuffer, sizeof(scratchBuffer)); 929 MLtype = set_compressed; 930 } } 931 frame->stats.fseInit = 1; 932 initSymbolSet(llCodeTable, nbSeq, frame->stats.litlengthSymbolSet, 35); 933 initSymbolSet(ofCodeTable, nbSeq, frame->stats.offsetSymbolSet, 28); 934 initSymbolSet(mlCodeTable, nbSeq, frame->stats.matchlengthSymbolSet, 52); 935 936 DISPLAYLEVEL(5, " LL type: %d OF type: %d ML type: %d\n", (unsigned)LLtype, (unsigned)Offtype, (unsigned)MLtype); 937 938 *seqHead = (BYTE)((LLtype<<6) + (Offtype<<4) + (MLtype<<2)); 939 940 /* Encoding Sequences */ 941 { BIT_CStream_t blockStream; 942 FSE_CState_t stateMatchLength; 943 FSE_CState_t stateOffsetBits; 944 FSE_CState_t stateLitLength; 945 946 RETURN_ERROR_IF( 947 ERR_isError(BIT_initCStream(&blockStream, op, oend-op)), 948 dstSize_tooSmall, "not enough space remaining"); 949 950 /* first symbols */ 951 FSE_initCState2(&stateMatchLength, CTable_MatchLength, mlCodeTable[nbSeq-1]); 952 FSE_initCState2(&stateOffsetBits, CTable_OffsetBits, ofCodeTable[nbSeq-1]); 953 FSE_initCState2(&stateLitLength, CTable_LitLength, llCodeTable[nbSeq-1]); 954 BIT_addBits(&blockStream, sequences[nbSeq-1].litLength, LL_bits[llCodeTable[nbSeq-1]]); 955 if (MEM_32bits()) BIT_flushBits(&blockStream); 956 BIT_addBits(&blockStream, sequences[nbSeq-1].mlBase, ML_bits[mlCodeTable[nbSeq-1]]); 957 if (MEM_32bits()) BIT_flushBits(&blockStream); 958 BIT_addBits(&blockStream, sequences[nbSeq-1].offBase, ofCodeTable[nbSeq-1]); 959 BIT_flushBits(&blockStream); 960 961 { size_t n; 962 for (n=nbSeq-2 ; n<nbSeq ; n--) { /* intentional underflow */ 963 BYTE const llCode = llCodeTable[n]; 964 BYTE const ofCode = ofCodeTable[n]; 965 BYTE const mlCode = mlCodeTable[n]; 966 U32 const llBits = LL_bits[llCode]; 967 U32 const ofBits = ofCode; /* 32b*/ /* 64b*/ 968 U32 const mlBits = ML_bits[mlCode]; 969 /* (7)*/ /* (7)*/ 970 FSE_encodeSymbol(&blockStream, &stateOffsetBits, ofCode); /* 15 */ /* 15 */ 971 FSE_encodeSymbol(&blockStream, &stateMatchLength, mlCode); /* 24 */ /* 24 */ 972 if (MEM_32bits()) BIT_flushBits(&blockStream); /* (7)*/ 973 FSE_encodeSymbol(&blockStream, &stateLitLength, llCode); /* 16 */ /* 33 */ 974 if (MEM_32bits() || (ofBits+mlBits+llBits >= 64-7-(LLFSELog+MLFSELog+OffFSELog))) 975 BIT_flushBits(&blockStream); /* (7)*/ 976 BIT_addBits(&blockStream, sequences[n].litLength, llBits); 977 if (MEM_32bits() && ((llBits+mlBits)>24)) BIT_flushBits(&blockStream); 978 BIT_addBits(&blockStream, sequences[n].mlBase, 
mlBits); 979 if (MEM_32bits()) BIT_flushBits(&blockStream); /* (7)*/ 980 BIT_addBits(&blockStream, sequences[n].offBase, ofBits); /* 31 */ 981 BIT_flushBits(&blockStream); /* (7)*/ 982 } } 983 984 FSE_flushCState(&blockStream, &stateMatchLength); 985 FSE_flushCState(&blockStream, &stateOffsetBits); 986 FSE_flushCState(&blockStream, &stateLitLength); 987 988 { size_t const streamSize = BIT_closeCStream(&blockStream); 989 if (streamSize==0) return ERROR(dstSize_tooSmall); /* not enough space */ 990 op += streamSize; 991 } } 992 993 frame->data = op; 994 995 return 0; 996 } 997 998 static size_t writeSequencesBlock(U32* seed, frame_t* frame, size_t contentSize, 999 size_t literalsSize, dictInfo info) 1000 { 1001 seqStore_t seqStore; 1002 size_t numSequences; 1003 1004 1005 initSeqStore(&seqStore); 1006 1007 /* randomly generate sequences */ 1008 numSequences = generateSequences(seed, frame, &seqStore, contentSize, literalsSize, info); 1009 /* write them out to the frame data */ 1010 CHECKERR(writeSequences(seed, frame, &seqStore, numSequences)); 1011 1012 return numSequences; 1013 } 1014 1015 static size_t writeCompressedBlock(U32* seed, frame_t* frame, size_t contentSize, dictInfo info) 1016 { 1017 BYTE* const blockStart = (BYTE*)frame->data; 1018 size_t literalsSize; 1019 size_t nbSeq; 1020 1021 DISPLAYLEVEL(4, " compressed block:\n"); 1022 1023 literalsSize = writeLiteralsBlock(seed, frame, contentSize); 1024 1025 DISPLAYLEVEL(4, " literals size: %u\n", (unsigned)literalsSize); 1026 1027 nbSeq = writeSequencesBlock(seed, frame, contentSize, literalsSize, info); 1028 1029 DISPLAYLEVEL(4, " number of sequences: %u\n", (unsigned)nbSeq); 1030 1031 return (BYTE*)frame->data - blockStart; 1032 } 1033 1034 static void writeBlock(U32* seed, frame_t* frame, size_t contentSize, 1035 int lastBlock, dictInfo info) 1036 { 1037 int const blockTypeDesc = RAND(seed) % 8; 1038 size_t blockSize; 1039 int blockType; 1040 1041 BYTE *const header = (BYTE*)frame->data; 1042 BYTE *op = header + 3; 1043 1044 DISPLAYLEVEL(4, " block:\n"); 1045 DISPLAYLEVEL(4, " block content size: %u\n", (unsigned)contentSize); 1046 DISPLAYLEVEL(4, " last block: %s\n", lastBlock ? 
"yes" : "no"); 1047 1048 if (blockTypeDesc == 0) { 1049 /* Raw data frame */ 1050 1051 RAND_buffer(seed, frame->src, contentSize); 1052 memcpy(op, frame->src, contentSize); 1053 1054 op += contentSize; 1055 blockType = 0; 1056 blockSize = contentSize; 1057 } else if (blockTypeDesc == 1 && frame->header.contentSize > 0) { 1058 /* RLE (Don't create RLE block if frame content is 0 since block size of 1 may exceed max block size)*/ 1059 BYTE const symbol = RAND(seed) & 0xff; 1060 1061 op[0] = symbol; 1062 memset(frame->src, symbol, contentSize); 1063 1064 op++; 1065 blockType = 1; 1066 blockSize = contentSize; 1067 } else { 1068 /* compressed, most common */ 1069 size_t compressedSize; 1070 blockType = 2; 1071 1072 frame->oldStats = frame->stats; 1073 1074 frame->data = op; 1075 compressedSize = writeCompressedBlock(seed, frame, contentSize, info); 1076 if (compressedSize >= contentSize) { /* compressed block must be strictly smaller than uncompressed one */ 1077 blockType = 0; 1078 memcpy(op, frame->src, contentSize); 1079 1080 op += contentSize; 1081 blockSize = contentSize; /* fall back on raw block if data doesn't 1082 compress */ 1083 1084 frame->stats = frame->oldStats; /* don't update the stats */ 1085 } else { 1086 op += compressedSize; 1087 blockSize = compressedSize; 1088 } 1089 } 1090 frame->src = (BYTE*)frame->src + contentSize; 1091 1092 DISPLAYLEVEL(4, " block type: %s\n", BLOCK_TYPES[blockType]); 1093 DISPLAYLEVEL(4, " block size field: %u\n", (unsigned)blockSize); 1094 1095 header[0] = (BYTE) ((lastBlock | (blockType << 1) | (blockSize << 3)) & 0xff); 1096 MEM_writeLE16(header + 1, (U16) (blockSize >> 5)); 1097 1098 frame->data = op; 1099 } 1100 1101 static void writeBlocks(U32* seed, frame_t* frame, dictInfo info) 1102 { 1103 size_t contentLeft = frame->header.contentSize; 1104 size_t const maxBlockSize = MIN(g_maxBlockSize, frame->header.windowSize); 1105 while (1) { 1106 /* 1 in 4 chance of ending frame */ 1107 int const lastBlock = contentLeft > maxBlockSize ? 
0 : !(RAND(seed) & 3); 1108 size_t blockContentSize; 1109 if (lastBlock) { 1110 blockContentSize = contentLeft; 1111 } else { 1112 if (contentLeft > 0 && (RAND(seed) & 7)) { 1113 /* some variable size block */ 1114 blockContentSize = RAND(seed) % (MIN(maxBlockSize, contentLeft)+1); 1115 } else if (contentLeft > maxBlockSize && (RAND(seed) & 1)) { 1116 /* some full size block */ 1117 blockContentSize = maxBlockSize; 1118 } else { 1119 /* some empty block */ 1120 blockContentSize = 0; 1121 } 1122 } 1123 1124 writeBlock(seed, frame, blockContentSize, lastBlock, info); 1125 1126 contentLeft -= blockContentSize; 1127 if (lastBlock) break; 1128 } 1129 } 1130 1131 static void writeChecksum(frame_t* frame) 1132 { 1133 /* write checksum so implementations can verify their output */ 1134 U64 digest = XXH64(frame->srcStart, (BYTE*)frame->src-(BYTE*)frame->srcStart, 0); 1135 DISPLAYLEVEL(3, " checksum: %08x\n", (unsigned)digest); 1136 MEM_writeLE32(frame->data, (U32)digest); 1137 frame->data = (BYTE*)frame->data + 4; 1138 } 1139 1140 static void outputBuffer(const void* buf, size_t size, const char* const path) 1141 { 1142 /* write data out to file */ 1143 const BYTE* ip = (const BYTE*)buf; 1144 FILE* out; 1145 if (path) { 1146 out = fopen(path, "wb"); 1147 } else { 1148 out = stdout; 1149 } 1150 if (!out) { 1151 fprintf(stderr, "Failed to open file at %s: ", path); 1152 perror(NULL); 1153 exit(1); 1154 } 1155 1156 { size_t fsize = size; 1157 size_t written = 0; 1158 while (written < fsize) { 1159 written += fwrite(ip + written, 1, fsize - written, out); 1160 if (ferror(out)) { 1161 fprintf(stderr, "Failed to write to file at %s: ", path); 1162 perror(NULL); 1163 exit(1); 1164 } 1165 } 1166 } 1167 1168 if (path) { 1169 fclose(out); 1170 } 1171 } 1172 1173 static void initFrame(frame_t* fr) 1174 { 1175 memset(fr, 0, sizeof(*fr)); 1176 fr->data = fr->dataStart = FRAME_BUFFER; 1177 fr->dataEnd = FRAME_BUFFER + sizeof(FRAME_BUFFER); 1178 fr->src = fr->srcStart = CONTENT_BUFFER; 1179 fr->srcEnd = CONTENT_BUFFER + sizeof(CONTENT_BUFFER); 1180 1181 /* init repeat codes */ 1182 fr->stats.rep[0] = 1; 1183 fr->stats.rep[1] = 4; 1184 fr->stats.rep[2] = 8; 1185 } 1186 1187 /** 1188 * Generated a single zstd compressed block with no block/frame header. 1189 * Returns the final seed. 
1190 */ 1191 static U32 generateCompressedBlock(U32 seed, frame_t* frame, dictInfo info) 1192 { 1193 size_t blockContentSize; 1194 int blockWritten = 0; 1195 BYTE* op; 1196 DISPLAYLEVEL(4, "block seed: %u\n", (unsigned)seed); 1197 initFrame(frame); 1198 op = (BYTE*)frame->data; 1199 1200 while (!blockWritten) { 1201 size_t cSize; 1202 /* generate window size */ 1203 { int const exponent = RAND(&seed) % (MAX_WINDOW_LOG - 10); 1204 int const mantissa = RAND(&seed) % 8; 1205 frame->header.windowSize = (1U << (exponent + 10)); 1206 frame->header.windowSize += (frame->header.windowSize / 8) * mantissa; 1207 } 1208 1209 /* generate content size */ 1210 { size_t const maxBlockSize = MIN(g_maxBlockSize, frame->header.windowSize); 1211 if (RAND(&seed) & 15) { 1212 /* some full size blocks */ 1213 blockContentSize = maxBlockSize; 1214 } else if (RAND(&seed) & 7 && g_maxBlockSize >= (1U << 7)) { 1215 /* some small blocks <= 128 bytes*/ 1216 blockContentSize = RAND(&seed) % (1U << 7); 1217 } else { 1218 /* some variable size blocks */ 1219 blockContentSize = RAND(&seed) % maxBlockSize; 1220 } 1221 } 1222 1223 /* try generating a compressed block */ 1224 frame->oldStats = frame->stats; 1225 frame->data = op; 1226 cSize = writeCompressedBlock(&seed, frame, blockContentSize, info); 1227 if (cSize >= blockContentSize) { /* compressed size must be strictly smaller than decompressed size : https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#blocks */ 1228 /* data doesn't compress -- try again */ 1229 frame->stats = frame->oldStats; /* don't update the stats */ 1230 DISPLAYLEVEL(5, " can't compress block : try again \n"); 1231 } else { 1232 blockWritten = 1; 1233 DISPLAYLEVEL(4, " block size: %u \n", (unsigned)cSize); 1234 frame->src = (BYTE*)frame->src + blockContentSize; 1235 } 1236 } 1237 return seed; 1238 } 1239 1240 /* Return the final seed */ 1241 static U32 generateFrame(U32 seed, frame_t* fr, dictInfo info) 1242 { 1243 /* generate a complete frame */ 1244 DISPLAYLEVEL(3, "frame seed: %u\n", (unsigned)seed); 1245 initFrame(fr); 1246 1247 writeFrameHeader(&seed, fr, info); 1248 writeBlocks(&seed, fr, info); 1249 writeChecksum(fr); 1250 1251 return seed; 1252 } 1253 1254 /*_******************************************************* 1255 * Dictionary Helper Functions 1256 *********************************************************/ 1257 /* returns 0 if successful, otherwise returns 1 upon error */ 1258 static int genRandomDict(U32 dictID, U32 seed, size_t dictSize, BYTE* fullDict) 1259 { 1260 /* allocate space for samples */ 1261 int ret = 0; 1262 unsigned const numSamples = 4; 1263 size_t sampleSizes[4]; 1264 BYTE* const samples = malloc(5000*sizeof(BYTE)); 1265 if (samples == NULL) { 1266 DISPLAY("Error: could not allocate space for samples\n"); 1267 return 1; 1268 } 1269 1270 /* generate samples */ 1271 { unsigned literalValue = 1; 1272 unsigned samplesPos = 0; 1273 size_t currSize = 1; 1274 while (literalValue <= 4) { 1275 sampleSizes[literalValue - 1] = currSize; 1276 { size_t k; 1277 for (k = 0; k < currSize; k++) { 1278 *(samples + (samplesPos++)) = (BYTE)literalValue; 1279 } } 1280 literalValue++; 1281 currSize *= 16; 1282 } } 1283 1284 { size_t dictWriteSize = 0; 1285 ZDICT_params_t zdictParams; 1286 size_t const headerSize = MAX(dictSize/4, 256); 1287 size_t const dictContentSize = dictSize - headerSize; 1288 BYTE* const dictContent = fullDict + headerSize; 1289 if (dictContentSize < ZDICT_CONTENTSIZE_MIN || dictSize < ZDICT_DICTSIZE_MIN) { 1290 DISPLAY("Error: dictionary size 
is too small\n"); 1291 ret = 1; 1292 goto exitGenRandomDict; 1293 } 1294 1295 /* init dictionary params */ 1296 memset(&zdictParams, 0, sizeof(zdictParams)); 1297 zdictParams.dictID = dictID; 1298 zdictParams.notificationLevel = 1; 1299 1300 /* fill in dictionary content */ 1301 RAND_buffer(&seed, (void*)dictContent, dictContentSize); 1302 1303 /* finalize dictionary with random samples */ 1304 dictWriteSize = ZDICT_finalizeDictionary(fullDict, dictSize, 1305 dictContent, dictContentSize, 1306 samples, sampleSizes, numSamples, 1307 zdictParams); 1308 1309 if (ZDICT_isError(dictWriteSize)) { 1310 DISPLAY("Could not finalize dictionary: %s\n", ZDICT_getErrorName(dictWriteSize)); 1311 ret = 1; 1312 } 1313 } 1314 1315 exitGenRandomDict: 1316 free(samples); 1317 return ret; 1318 } 1319 1320 static dictInfo initDictInfo(int useDict, size_t dictContentSize, BYTE* dictContent, U32 dictID){ 1321 /* allocate space statically */ 1322 dictInfo dictOp; 1323 memset(&dictOp, 0, sizeof(dictOp)); 1324 dictOp.useDict = useDict; 1325 dictOp.dictContentSize = dictContentSize; 1326 dictOp.dictContent = dictContent; 1327 dictOp.dictID = dictID; 1328 return dictOp; 1329 } 1330 1331 /*-******************************************************* 1332 * Test Mode 1333 *********************************************************/ 1334 1335 BYTE DECOMPRESSED_BUFFER[MAX_DECOMPRESSED_SIZE]; 1336 1337 static size_t testDecodeSimple(frame_t* fr) 1338 { 1339 /* test decoding the generated data with the simple API */ 1340 size_t const ret = ZSTD_decompress(DECOMPRESSED_BUFFER, MAX_DECOMPRESSED_SIZE, 1341 fr->dataStart, (BYTE*)fr->data - (BYTE*)fr->dataStart); 1342 1343 if (ZSTD_isError(ret)) return ret; 1344 1345 if (memcmp(DECOMPRESSED_BUFFER, fr->srcStart, 1346 (BYTE*)fr->src - (BYTE*)fr->srcStart) != 0) { 1347 return ERROR(corruption_detected); 1348 } 1349 1350 return ret; 1351 } 1352 1353 static size_t testDecodeStreaming(frame_t* fr) 1354 { 1355 /* test decoding the generated data with the streaming API */ 1356 ZSTD_DStream* zd = ZSTD_createDStream(); 1357 ZSTD_inBuffer in; 1358 ZSTD_outBuffer out; 1359 size_t ret; 1360 1361 if (!zd) return ERROR(memory_allocation); 1362 1363 in.src = fr->dataStart; 1364 in.pos = 0; 1365 in.size = (BYTE*)fr->data - (BYTE*)fr->dataStart; 1366 1367 out.dst = DECOMPRESSED_BUFFER; 1368 out.pos = 0; 1369 out.size = ZSTD_DStreamOutSize(); 1370 1371 ZSTD_initDStream(zd); 1372 while (1) { 1373 ret = ZSTD_decompressStream(zd, &out, &in); 1374 if (ZSTD_isError(ret)) goto cleanup; /* error */ 1375 if (ret == 0) break; /* frame is done */ 1376 1377 /* force decoding to be done in chunks */ 1378 out.size += MIN(ZSTD_DStreamOutSize(), MAX_DECOMPRESSED_SIZE - out.size); 1379 } 1380 1381 ret = out.pos; 1382 1383 if (memcmp(out.dst, fr->srcStart, out.pos) != 0) { 1384 return ERROR(corruption_detected); 1385 } 1386 1387 cleanup: 1388 ZSTD_freeDStream(zd); 1389 return ret; 1390 } 1391 1392 static size_t testDecodeWithDict(U32 seed, genType_e genType) 1393 { 1394 /* create variables */ 1395 size_t const dictSize = RAND(&seed) % (10 << 20) + ZDICT_DICTSIZE_MIN + ZDICT_CONTENTSIZE_MIN; 1396 U32 const dictID = RAND(&seed); 1397 size_t errorDetected = 0; 1398 BYTE* const fullDict = malloc(dictSize); 1399 if (fullDict == NULL) { 1400 return ERROR(GENERIC); 1401 } 1402 1403 /* generate random dictionary */ 1404 if (genRandomDict(dictID, seed, dictSize, fullDict)) { /* return 0 on success */ 1405 errorDetected = ERROR(GENERIC); 1406 goto dictTestCleanup; 1407 } 1408 1409 1410 { frame_t fr; 1411 dictInfo info; 1412 
ZSTD_DCtx* const dctx = ZSTD_createDCtx(); 1413 size_t ret; 1414 1415 /* get dict info */ 1416 { size_t const headerSize = MAX(dictSize/4, 256); 1417 size_t const dictContentSize = dictSize-headerSize; 1418 BYTE* const dictContent = fullDict+headerSize; 1419 info = initDictInfo(1, dictContentSize, dictContent, dictID); 1420 } 1421 1422 /* manually decompress and check difference */ 1423 if (genType == gt_frame) { 1424 /* Test frame */ 1425 generateFrame(seed, &fr, info); 1426 ret = ZSTD_decompress_usingDict(dctx, DECOMPRESSED_BUFFER, MAX_DECOMPRESSED_SIZE, 1427 fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart, 1428 fullDict, dictSize); 1429 } else { 1430 /* Test block */ 1431 generateCompressedBlock(seed, &fr, info); 1432 ret = ZSTD_decompressBegin_usingDict(dctx, fullDict, dictSize); 1433 if (ZSTD_isError(ret)) { 1434 errorDetected = ret; 1435 ZSTD_freeDCtx(dctx); 1436 goto dictTestCleanup; 1437 } 1438 ret = ZSTD_decompressBlock_deprecated(dctx, DECOMPRESSED_BUFFER, MAX_DECOMPRESSED_SIZE, 1439 fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart); 1440 } 1441 ZSTD_freeDCtx(dctx); 1442 1443 if (ZSTD_isError(ret)) { 1444 errorDetected = ret; 1445 goto dictTestCleanup; 1446 } 1447 1448 if (memcmp(DECOMPRESSED_BUFFER, fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart) != 0) { 1449 errorDetected = ERROR(corruption_detected); 1450 goto dictTestCleanup; 1451 } 1452 } 1453 1454 dictTestCleanup: 1455 free(fullDict); 1456 return errorDetected; 1457 } 1458 1459 static size_t testDecodeRawBlock(frame_t* fr) 1460 { 1461 ZSTD_DCtx* dctx = ZSTD_createDCtx(); 1462 size_t ret = ZSTD_decompressBegin(dctx); 1463 if (ZSTD_isError(ret)) return ret; 1464 1465 ret = ZSTD_decompressBlock_deprecated( 1466 dctx, 1467 DECOMPRESSED_BUFFER, MAX_DECOMPRESSED_SIZE, 1468 fr->dataStart, (BYTE*)fr->data - (BYTE*)fr->dataStart); 1469 ZSTD_freeDCtx(dctx); 1470 if (ZSTD_isError(ret)) return ret; 1471 1472 if (memcmp(DECOMPRESSED_BUFFER, fr->srcStart, 1473 (BYTE*)fr->src - (BYTE*)fr->srcStart) != 0) { 1474 return ERROR(corruption_detected); 1475 } 1476 1477 return ret; 1478 } 1479 1480 static int runBlockTest(U32* seed) 1481 { 1482 frame_t fr; 1483 U32 const seedCopy = *seed; 1484 { dictInfo const info = initDictInfo(0, 0, NULL, 0); 1485 *seed = generateCompressedBlock(*seed, &fr, info); 1486 } 1487 1488 { size_t const r = testDecodeRawBlock(&fr); 1489 if (ZSTD_isError(r)) { 1490 DISPLAY("Error in block mode on test seed %u: %s\n", 1491 (unsigned)seedCopy, ZSTD_getErrorName(r)); 1492 return 1; 1493 } 1494 } 1495 1496 { size_t const r = testDecodeWithDict(*seed, gt_block); 1497 if (ZSTD_isError(r)) { 1498 DISPLAY("Error in block mode with dictionary on test seed %u: %s\n", 1499 (unsigned)seedCopy, ZSTD_getErrorName(r)); 1500 return 1; 1501 } 1502 } 1503 return 0; 1504 } 1505 1506 static int runFrameTest(U32* seed) 1507 { 1508 frame_t fr; 1509 U32 const seedCopy = *seed; 1510 { dictInfo const info = initDictInfo(0, 0, NULL, 0); 1511 *seed = generateFrame(*seed, &fr, info); 1512 } 1513 1514 { size_t const r = testDecodeSimple(&fr); 1515 if (ZSTD_isError(r)) { 1516 DISPLAY("Error in simple mode on test seed %u: %s\n", 1517 (unsigned)seedCopy, ZSTD_getErrorName(r)); 1518 return 1; 1519 } 1520 } 1521 { size_t const r = testDecodeStreaming(&fr); 1522 if (ZSTD_isError(r)) { 1523 DISPLAY("Error in streaming mode on test seed %u: %s\n", 1524 (unsigned)seedCopy, ZSTD_getErrorName(r)); 1525 return 1; 1526 } 1527 } 1528 { size_t const r = testDecodeWithDict(*seed, gt_frame); /* avoid big dictionaries */ 1529 if (ZSTD_isError(r)) { 1530 
DISPLAY("Error in dictionary mode on test seed %u: %s\n", 1531 (unsigned)seedCopy, ZSTD_getErrorName(r)); 1532 return 1; 1533 } 1534 } 1535 return 0; 1536 } 1537 1538 static int runTestMode(U32 seed, unsigned numFiles, unsigned const testDurationS, 1539 genType_e genType) 1540 { 1541 unsigned fnum; 1542 1543 UTIL_time_t const startClock = UTIL_getTime(); 1544 U64 const maxClockSpan = testDurationS * SEC_TO_MICRO; 1545 1546 if (numFiles == 0 && !testDurationS) numFiles = 1; 1547 1548 DISPLAY("seed: %u\n", (unsigned)seed); 1549 1550 for (fnum = 0; fnum < numFiles || UTIL_clockSpanMicro(startClock) < maxClockSpan; fnum++) { 1551 if (fnum < numFiles) 1552 DISPLAYUPDATE("\r%u/%u ", fnum, numFiles); 1553 else 1554 DISPLAYUPDATE("\r%u ", fnum); 1555 1556 { int const ret = (genType == gt_frame) ? 1557 runFrameTest(&seed) : 1558 runBlockTest(&seed); 1559 if (ret) return ret; 1560 } 1561 } 1562 1563 DISPLAY("\r%u tests completed: ", fnum); 1564 DISPLAY("OK\n"); 1565 1566 return 0; 1567 } 1568 1569 /*-******************************************************* 1570 * File I/O 1571 *********************************************************/ 1572 1573 static int generateFile(U32 seed, const char* const path, 1574 const char* const origPath, genType_e genType) 1575 { 1576 frame_t fr; 1577 1578 DISPLAY("seed: %u\n", (unsigned)seed); 1579 1580 { dictInfo const info = initDictInfo(0, 0, NULL, 0); 1581 if (genType == gt_frame) { 1582 generateFrame(seed, &fr, info); 1583 } else { 1584 generateCompressedBlock(seed, &fr, info); 1585 } 1586 } 1587 outputBuffer(fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart, path); 1588 if (origPath) { 1589 outputBuffer(fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart, origPath); 1590 } 1591 return 0; 1592 } 1593 1594 static int generateCorpus(U32 seed, unsigned numFiles, const char* const path, 1595 const char* const origPath, genType_e genType) 1596 { 1597 char outPath[MAX_PATH]; 1598 unsigned fnum; 1599 1600 DISPLAY("seed: %u\n", (unsigned)seed); 1601 1602 for (fnum = 0; fnum < numFiles; fnum++) { 1603 frame_t fr; 1604 1605 DISPLAYUPDATE("\r%u/%u ", fnum, numFiles); 1606 1607 { dictInfo const info = initDictInfo(0, 0, NULL, 0); 1608 if (genType == gt_frame) { 1609 seed = generateFrame(seed, &fr, info); 1610 } else { 1611 seed = generateCompressedBlock(seed, &fr, info); 1612 } 1613 } 1614 1615 if (snprintf(outPath, MAX_PATH, "%s/z%06u.zst", path, fnum) + 1 > MAX_PATH) { 1616 DISPLAY("Error: path too long\n"); 1617 return 1; 1618 } 1619 outputBuffer(fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart, outPath); 1620 1621 if (origPath) { 1622 if (snprintf(outPath, MAX_PATH, "%s/z%06u", origPath, fnum) + 1 > MAX_PATH) { 1623 DISPLAY("Error: path too long\n"); 1624 return 1; 1625 } 1626 outputBuffer(fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart, outPath); 1627 } 1628 } 1629 1630 DISPLAY("\r%u/%u \n", fnum, numFiles); 1631 1632 return 0; 1633 } 1634 1635 static int generateCorpusWithDict(U32 seed, unsigned numFiles, const char* const path, 1636 const char* const origPath, const size_t dictSize, 1637 genType_e genType) 1638 { 1639 char outPath[MAX_PATH]; 1640 BYTE* fullDict; 1641 U32 const dictID = RAND(&seed); 1642 int errorDetected = 0; 1643 1644 if (snprintf(outPath, MAX_PATH, "%s/dictionary", path) + 1 > MAX_PATH) { 1645 DISPLAY("Error: path too long\n"); 1646 return 1; 1647 } 1648 1649 /* allocate space for the dictionary */ 1650 fullDict = malloc(dictSize); 1651 if (fullDict == NULL) { 1652 DISPLAY("Error: could not allocate space for full dictionary.\n"); 1653 return 1; 1654 } 
1655 1656 /* randomly generate the dictionary */ 1657 { int const ret = genRandomDict(dictID, seed, dictSize, fullDict); 1658 if (ret != 0) { 1659 errorDetected = ret; 1660 goto dictCleanup; 1661 } 1662 } 1663 1664 /* write out dictionary */ 1665 if (numFiles != 0) { 1666 if (snprintf(outPath, MAX_PATH, "%s/dictionary", path) + 1 > MAX_PATH) { 1667 DISPLAY("Error: dictionary path too long\n"); 1668 errorDetected = 1; 1669 goto dictCleanup; 1670 } 1671 outputBuffer(fullDict, dictSize, outPath); 1672 } 1673 else { 1674 outputBuffer(fullDict, dictSize, "dictionary"); 1675 } 1676 1677 /* generate random compressed/decompressed files */ 1678 { unsigned fnum; 1679 for (fnum = 0; fnum < MAX(numFiles, 1); fnum++) { 1680 frame_t fr; 1681 DISPLAYUPDATE("\r%u/%u ", fnum, numFiles); 1682 { 1683 size_t const headerSize = MAX(dictSize/4, 256); 1684 size_t const dictContentSize = dictSize-headerSize; 1685 BYTE* const dictContent = fullDict+headerSize; 1686 dictInfo const info = initDictInfo(1, dictContentSize, dictContent, dictID); 1687 if (genType == gt_frame) { 1688 seed = generateFrame(seed, &fr, info); 1689 } else { 1690 seed = generateCompressedBlock(seed, &fr, info); 1691 } 1692 } 1693 1694 if (numFiles != 0) { 1695 if (snprintf(outPath, MAX_PATH, "%s/z%06u.zst", path, fnum) + 1 > MAX_PATH) { 1696 DISPLAY("Error: path too long\n"); 1697 errorDetected = 1; 1698 goto dictCleanup; 1699 } 1700 outputBuffer(fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart, outPath); 1701 1702 if (origPath) { 1703 if (snprintf(outPath, MAX_PATH, "%s/z%06u", origPath, fnum) + 1 > MAX_PATH) { 1704 DISPLAY("Error: path too long\n"); 1705 errorDetected = 1; 1706 goto dictCleanup; 1707 } 1708 outputBuffer(fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart, outPath); 1709 } 1710 } 1711 else { 1712 outputBuffer(fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart, path); 1713 if (origPath) { 1714 outputBuffer(fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart, origPath); 1715 } 1716 } 1717 } 1718 } 1719 1720 dictCleanup: 1721 free(fullDict); 1722 return errorDetected; 1723 } 1724 1725 1726 /*_******************************************************* 1727 * Command line 1728 *********************************************************/ 1729 static U32 makeSeed(void) 1730 { 1731 U32 t = (U32) time(NULL); 1732 return XXH32(&t, sizeof(t), 0) % 65536; 1733 } 1734 1735 static unsigned readInt(const char** argument) 1736 { 1737 unsigned val = 0; 1738 while ((**argument>='0') && (**argument<='9')) { 1739 val *= 10; 1740 val += **argument - '0'; 1741 (*argument)++; 1742 } 1743 return val; 1744 } 1745 1746 static void usage(const char* programName) 1747 { 1748 DISPLAY( "Usage :\n"); 1749 DISPLAY( " %s [args]\n", programName); 1750 DISPLAY( "\n"); 1751 DISPLAY( "Arguments :\n"); 1752 DISPLAY( " -p<path> : select output path (default:stdout)\n"); 1753 DISPLAY( " in multiple files mode this should be a directory\n"); 1754 DISPLAY( " -o<path> : select path to output original file (default:no output)\n"); 1755 DISPLAY( " in multiple files mode this should be a directory\n"); 1756 DISPLAY( " -s# : select seed (default:random based on time)\n"); 1757 DISPLAY( " -n# : number of files to generate (default:1)\n"); 1758 DISPLAY( " -t : activate test mode (test files against libzstd instead of outputting them)\n"); 1759 DISPLAY( " -T# : length of time to run tests for\n"); 1760 DISPLAY( " -v : increase verbosity level (default:0, max:7)\n"); 1761 DISPLAY( " -h/H : display help/long help and exit\n"); 1762 } 1763 1764 static void advancedUsage(const char* 
static void advancedUsage(const char* programName)
{
    usage(programName);
    DISPLAY( "\n");
    DISPLAY( "Advanced arguments :\n");
    DISPLAY( " --content-size           : always include the content size in the frame header\n");
    DISPLAY( " --use-dict=#             : include a dictionary used to decompress the corpus\n");
    DISPLAY( " --gen-blocks             : generate raw compressed blocks without block/frame headers\n");
    DISPLAY( " --max-block-size-log=#   : max block size log, must be in range [2, 17]\n");
    DISPLAY( " --max-content-size-log=# : max content size log, must be <= 20\n");
    DISPLAY( "                            (this is ignored with gen-blocks)\n");
}

/*! readU32FromChar() :
    @return : unsigned integer value read from input in `char` format;
              allows and interprets K, KB, KiB, M, MB and MiB suffixes.
    Also modifies `*stringPtr`, advancing it to the position where it stopped reading.
    Note : result can overflow if the digit string represents a value > UINT_MAX */
static unsigned readU32FromChar(const char** stringPtr)
{
    unsigned result = 0;
    while ((**stringPtr >= '0') && (**stringPtr <= '9'))
        result *= 10, result += **stringPtr - '0', (*stringPtr)++;
    if ((**stringPtr == 'K') || (**stringPtr == 'M')) {
        result <<= 10;
        if (**stringPtr == 'M') result <<= 10;
        (*stringPtr)++;
        if (**stringPtr == 'i') (*stringPtr)++;
        if (**stringPtr == 'B') (*stringPtr)++;
    }
    return result;
}

/** longCommandWArg() :
 *  check if *stringPtr starts with longCommand.
 *  If yes, @return 1 and advance *stringPtr to the position which immediately follows longCommand.
 *  @return 0 and don't modify *stringPtr otherwise.
 */
static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
{
    size_t const comSize = strlen(longCommand);
    int const result = !strncmp(*stringPtr, longCommand, comSize);
    if (result) *stringPtr += comSize;
    return result;
}

int main(int argc, char** argv)
{
    U32 seed = 0;
    int seedset = 0;
    unsigned numFiles = 0;
    unsigned testDuration = 0;
    int testMode = 0;
    const char* path = NULL;
    const char* origPath = NULL;
    int useDict = 0;
    unsigned dictSize = (10 << 10);   /* 10 KiB default */
    genType_e genType = gt_frame;

    int argNb;

    /* Check command line */
    for (argNb = 1; argNb < argc; argNb++) {
        const char* argument = argv[argNb];
        if (!argument) continue;   /* Protection if argument empty */

        /* Handle commands. Aggregated commands are allowed */
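        /* e.g. "-t -n100" may also be written "-tn100": after the leading '-',
         * the switch below consumes each flag character in turn. */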
        if (argument[0] == '-') {
            argument++;
            while (*argument != 0) {
                switch (*argument)
                {
                case 'h':
                    usage(argv[0]);
                    return 0;
                case 'H':
                    advancedUsage(argv[0]);
                    return 0;
                case 'v':
                    argument++;
                    g_displayLevel++;
                    break;
                case 's':
                    argument++;
                    seedset = 1;
                    seed = readInt(&argument);
                    break;
                case 'n':
                    argument++;
                    numFiles = readInt(&argument);
                    break;
                case 'T':
                    argument++;
                    testDuration = readInt(&argument);
                    if (*argument == 'm') {
                        testDuration *= 60;
                        argument++;
                        if (*argument == 'n') argument++;
                    }
                    break;
                case 'o':
                    argument++;
                    origPath = argument;
                    argument += strlen(argument);
                    break;
                case 'p':
                    argument++;
                    path = argument;
                    argument += strlen(argument);
                    break;
                case 't':
                    argument++;
                    testMode = 1;
                    break;
                case '-':
                    argument++;
                    if (strcmp(argument, "content-size") == 0) {
                        opts.contentSize = 1;
                    } else if (longCommandWArg(&argument, "use-dict=")) {
                        dictSize = readU32FromChar(&argument);
                        useDict = 1;
                    } else if (strcmp(argument, "gen-blocks") == 0) {
                        genType = gt_block;
                    } else if (longCommandWArg(&argument, "max-block-size-log=")) {
                        U32 value = readU32FromChar(&argument);
                        /* valid range is [2, 17]; ZSTD_BLOCKSIZELOG_MAX == 17.
                         * Out-of-range values are silently ignored. */
                        if (value >= 2 && value <= ZSTD_BLOCKSIZELOG_MAX) {
                            g_maxBlockSize = 1U << value;
                        }
                    } else if (longCommandWArg(&argument, "max-content-size-log=")) {
                        U32 value = readU32FromChar(&argument);
                        g_maxDecompressedSizeLog =
                                MIN(MAX_DECOMPRESSED_SIZE_LOG, value);
                    } else {
                        advancedUsage(argv[0]);
                        return 1;
                    }
                    argument += strlen(argument);
                    break;
                default:
                    usage(argv[0]);
                    return 1;
                }
            }
        }
    }   /* for (argNb=1; argNb<argc; argNb++) */

    if (!seedset) {
        seed = makeSeed();
    }

    if (testMode) {
        return runTestMode(seed, numFiles, testDuration, genType);
    } else {
        if (testDuration) {
            DISPLAY("Error: -T requires test mode (-t)\n\n");
            usage(argv[0]);
            return 1;
        }
    }

    if (!path) {
        DISPLAY("Error: path is required in file generation mode\n");
        usage(argv[0]);
        return 1;
    }

    if (numFiles == 0 && useDict == 0) {
        return generateFile(seed, path, origPath, genType);
    } else if (useDict == 0) {
        return generateCorpus(seed, numFiles, path, origPath, genType);
    } else {
        /* should generate files with a dictionary */
        return generateCorpusWithDict(seed, numFiles, path, origPath, dictSize, genType);
    }
}