1 /* 2 * Copyright (c) Meta Platforms, Inc. and affiliates. 3 * All rights reserved. 4 * 5 * This source code is licensed under both the BSD-style license (found in the 6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found 7 * in the COPYING file in the root directory of this source tree). 8 * You may select, at your option, one of the above-listed licenses. 9 */ 10 11 12 /*-************************************** 13 * Tuning parameters 14 ****************************************/ 15 #define MINRATIO 4 /* minimum nb of apparition to be selected in dictionary */ 16 #define ZDICT_MAX_SAMPLES_SIZE (2000U << 20) 17 #define ZDICT_MIN_SAMPLES_SIZE (ZDICT_CONTENTSIZE_MIN * MINRATIO) 18 19 20 /*-************************************** 21 * Compiler Options 22 ****************************************/ 23 /* Unix Large Files support (>4GB) */ 24 #define _FILE_OFFSET_BITS 64 25 #if (defined(__sun__) && (!defined(__LP64__))) /* Sun Solaris 32-bits requires specific definitions */ 26 # ifndef _LARGEFILE_SOURCE 27 # define _LARGEFILE_SOURCE 28 # endif 29 #elif ! defined(__LP64__) /* No point defining Large file for 64 bit */ 30 # ifndef _LARGEFILE64_SOURCE 31 # define _LARGEFILE64_SOURCE 32 # endif 33 #endif 34 35 36 /*-************************************* 37 * Dependencies 38 ***************************************/ 39 #include <stdlib.h> /* malloc, free */ 40 #include <string.h> /* memset */ 41 #include <stdio.h> /* fprintf, fopen, ftello64 */ 42 #include <time.h> /* clock */ 43 44 #ifndef ZDICT_STATIC_LINKING_ONLY 45 # define ZDICT_STATIC_LINKING_ONLY 46 #endif 47 48 #include "../common/mem.h" /* read */ 49 #include "../common/fse.h" /* FSE_normalizeCount, FSE_writeNCount */ 50 #include "../common/huf.h" /* HUF_buildCTable, HUF_writeCTable */ 51 #include "../common/zstd_internal.h" /* includes zstd.h */ 52 #include "../common/xxhash.h" /* XXH64 */ 53 #include "../compress/zstd_compress_internal.h" /* ZSTD_loadCEntropy() */ 54 #include "../zdict.h" 55 #include "divsufsort.h" 56 #include "../common/bits.h" /* ZSTD_NbCommonBytes */ 57 58 59 /*-************************************* 60 * Constants 61 ***************************************/ 62 #define KB *(1 <<10) 63 #define MB *(1 <<20) 64 #define GB *(1U<<30) 65 66 #define DICTLISTSIZE_DEFAULT 10000 67 68 #define NOISELENGTH 32 69 70 static const U32 g_selectivity_default = 9; 71 72 73 /*-************************************* 74 * Console display 75 ***************************************/ 76 #undef DISPLAY 77 #define DISPLAY(...) do { fprintf(stderr, __VA_ARGS__); fflush( stderr ); } while (0) 78 #undef DISPLAYLEVEL 79 #define DISPLAYLEVEL(l, ...) do { if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); } } while (0) /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */ 80 81 static clock_t ZDICT_clockSpan(clock_t nPrevious) { return clock() - nPrevious; } 82 83 static void ZDICT_printHex(const void* ptr, size_t length) 84 { 85 const BYTE* const b = (const BYTE*)ptr; 86 size_t u; 87 for (u=0; u<length; u++) { 88 BYTE c = b[u]; 89 if (c<32 || c>126) c = '.'; /* non-printable char */ 90 DISPLAY("%c", c); 91 } 92 } 93 94 95 /*-******************************************************** 96 * Helper functions 97 **********************************************************/ 98 unsigned ZDICT_isError(size_t errorCode) { return ERR_isError(errorCode); } 99 100 const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); } 101 102 unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize) 103 { 104 if (dictSize < 8) return 0; 105 if (MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return 0; 106 return MEM_readLE32((const char*)dictBuffer + 4); 107 } 108 109 size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize) 110 { 111 size_t headerSize; 112 if (dictSize <= 8 || MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return ERROR(dictionary_corrupted); 113 114 { ZSTD_compressedBlockState_t* bs = (ZSTD_compressedBlockState_t*)malloc(sizeof(ZSTD_compressedBlockState_t)); 115 U32* wksp = (U32*)malloc(HUF_WORKSPACE_SIZE); 116 if (!bs || !wksp) { 117 headerSize = ERROR(memory_allocation); 118 } else { 119 ZSTD_reset_compressedBlockState(bs); 120 headerSize = ZSTD_loadCEntropy(bs, wksp, dictBuffer, dictSize); 121 } 122 123 free(bs); 124 free(wksp); 125 } 126 127 return headerSize; 128 } 129 130 /*-******************************************************** 131 * Dictionary training functions 132 **********************************************************/ 133 /*! ZDICT_count() : 134 Count the nb of common bytes between 2 pointers. 135 Note : this function presumes end of buffer followed by noisy guard band. 136 */ 137 static size_t ZDICT_count(const void* pIn, const void* pMatch) 138 { 139 const char* const pStart = (const char*)pIn; 140 for (;;) { 141 size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn); 142 if (!diff) { 143 pIn = (const char*)pIn+sizeof(size_t); 144 pMatch = (const char*)pMatch+sizeof(size_t); 145 continue; 146 } 147 pIn = (const char*)pIn+ZSTD_NbCommonBytes(diff); 148 return (size_t)((const char*)pIn - pStart); 149 } 150 } 151 152 153 typedef struct { 154 U32 pos; 155 U32 length; 156 U32 savings; 157 } dictItem; 158 159 static void ZDICT_initDictItem(dictItem* d) 160 { 161 d->pos = 1; 162 d->length = 0; 163 d->savings = (U32)(-1); 164 } 165 166 167 #define LLIMIT 64 /* heuristic determined experimentally */ 168 #define MINMATCHLENGTH 7 /* heuristic determined experimentally */ 169 static dictItem ZDICT_analyzePos( 170 BYTE* doneMarks, 171 const int* suffix, U32 start, 172 const void* buffer, U32 minRatio, U32 notificationLevel) 173 { 174 U32 lengthList[LLIMIT] = {0}; 175 U32 cumulLength[LLIMIT] = {0}; 176 U32 savings[LLIMIT] = {0}; 177 const BYTE* b = (const BYTE*)buffer; 178 size_t maxLength = LLIMIT; 179 size_t pos = (size_t)suffix[start]; 180 U32 end = start; 181 dictItem solution; 182 183 /* init */ 184 memset(&solution, 0, sizeof(solution)); 185 doneMarks[pos] = 1; 186 187 /* trivial repetition cases */ 188 if ( (MEM_read16(b+pos+0) == MEM_read16(b+pos+2)) 189 ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3)) 190 ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) { 191 /* skip and mark segment */ 192 U16 const pattern16 = MEM_read16(b+pos+4); 193 U32 u, patternEnd = 6; 194 while (MEM_read16(b+pos+patternEnd) == pattern16) patternEnd+=2 ; 195 if (b[pos+patternEnd] == b[pos+patternEnd-1]) patternEnd++; 196 for (u=1; u<patternEnd; u++) 197 doneMarks[pos+u] = 1; 198 return solution; 199 } 200 201 /* look forward */ 202 { size_t length; 203 do { 204 end++; 205 length = ZDICT_count(b + pos, b + suffix[end]); 206 } while (length >= MINMATCHLENGTH); 207 } 208 209 /* look backward */ 210 { size_t length; 211 do { 212 length = ZDICT_count(b + pos, b + *(suffix+start-1)); 213 if (length >=MINMATCHLENGTH) start--; 214 } while(length >= MINMATCHLENGTH); 215 } 216 217 /* exit if not found a minimum nb of repetitions */ 218 if (end-start < minRatio) { 219 U32 idx; 220 for(idx=start; idx<end; idx++) 221 doneMarks[suffix[idx]] = 1; 222 return solution; 223 } 224 225 { int i; 226 U32 mml; 227 U32 refinedStart = start; 228 U32 refinedEnd = end; 229 230 DISPLAYLEVEL(4, "\n"); 231 DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u ", (unsigned)(end-start), MINMATCHLENGTH, (unsigned)pos); 232 DISPLAYLEVEL(4, "\n"); 233 234 for (mml = MINMATCHLENGTH ; ; mml++) { 235 BYTE currentChar = 0; 236 U32 currentCount = 0; 237 U32 currentID = refinedStart; 238 U32 id; 239 U32 selectedCount = 0; 240 U32 selectedID = currentID; 241 for (id =refinedStart; id < refinedEnd; id++) { 242 if (b[suffix[id] + mml] != currentChar) { 243 if (currentCount > selectedCount) { 244 selectedCount = currentCount; 245 selectedID = currentID; 246 } 247 currentID = id; 248 currentChar = b[ suffix[id] + mml]; 249 currentCount = 0; 250 } 251 currentCount ++; 252 } 253 if (currentCount > selectedCount) { /* for last */ 254 selectedCount = currentCount; 255 selectedID = currentID; 256 } 257 258 if (selectedCount < minRatio) 259 break; 260 refinedStart = selectedID; 261 refinedEnd = refinedStart + selectedCount; 262 } 263 264 /* evaluate gain based on new dict */ 265 start = refinedStart; 266 pos = suffix[refinedStart]; 267 end = start; 268 memset(lengthList, 0, sizeof(lengthList)); 269 270 /* look forward */ 271 { size_t length; 272 do { 273 end++; 274 length = ZDICT_count(b + pos, b + suffix[end]); 275 if (length >= LLIMIT) length = LLIMIT-1; 276 lengthList[length]++; 277 } while (length >=MINMATCHLENGTH); 278 } 279 280 /* look backward */ 281 { size_t length = MINMATCHLENGTH; 282 while ((length >= MINMATCHLENGTH) & (start > 0)) { 283 length = ZDICT_count(b + pos, b + suffix[start - 1]); 284 if (length >= LLIMIT) length = LLIMIT - 1; 285 lengthList[length]++; 286 if (length >= MINMATCHLENGTH) start--; 287 } 288 } 289 290 /* largest useful length */ 291 memset(cumulLength, 0, sizeof(cumulLength)); 292 cumulLength[maxLength-1] = lengthList[maxLength-1]; 293 for (i=(int)(maxLength-2); i>=0; i--) 294 cumulLength[i] = cumulLength[i+1] + lengthList[i]; 295 296 for (i=LLIMIT-1; i>=MINMATCHLENGTH; i--) if (cumulLength[i]>=minRatio) break; 297 maxLength = i; 298 299 /* reduce maxLength in case of final into repetitive data */ 300 { U32 l = (U32)maxLength; 301 BYTE const c = b[pos + maxLength-1]; 302 while (b[pos+l-2]==c) l--; 303 maxLength = l; 304 } 305 if (maxLength < MINMATCHLENGTH) return solution; /* skip : no long-enough solution */ 306 307 /* calculate savings */ 308 savings[5] = 0; 309 for (i=MINMATCHLENGTH; i<=(int)maxLength; i++) 310 savings[i] = savings[i-1] + (lengthList[i] * (i-3)); 311 312 DISPLAYLEVEL(4, "Selected dict at position %u, of length %u : saves %u (ratio: %.2f) \n", 313 (unsigned)pos, (unsigned)maxLength, (unsigned)savings[maxLength], (double)savings[maxLength] / (double)maxLength); 314 315 solution.pos = (U32)pos; 316 solution.length = (U32)maxLength; 317 solution.savings = savings[maxLength]; 318 319 /* mark positions done */ 320 { U32 id; 321 for (id=start; id<end; id++) { 322 U32 p, pEnd, length; 323 U32 const testedPos = (U32)suffix[id]; 324 if (testedPos == pos) 325 length = solution.length; 326 else { 327 length = (U32)ZDICT_count(b+pos, b+testedPos); 328 if (length > solution.length) length = solution.length; 329 } 330 pEnd = (U32)(testedPos + length); 331 for (p=testedPos; p<pEnd; p++) 332 doneMarks[p] = 1; 333 } } } 334 335 return solution; 336 } 337 338 339 static int isIncluded(const void* in, const void* container, size_t length) 340 { 341 const char* const ip = (const char*) in; 342 const char* const into = (const char*) container; 343 size_t u; 344 345 for (u=0; u<length; u++) { /* works because end of buffer is a noisy guard band */ 346 if (ip[u] != into[u]) break; 347 } 348 349 return u==length; 350 } 351 352 /*! ZDICT_tryMerge() : 353 check if dictItem can be merged, do it if possible 354 @return : id of destination elt, 0 if not merged 355 */ 356 static U32 ZDICT_tryMerge(dictItem* table, dictItem elt, U32 eltNbToSkip, const void* buffer) 357 { 358 const U32 tableSize = table->pos; 359 const U32 eltEnd = elt.pos + elt.length; 360 const char* const buf = (const char*) buffer; 361 362 /* tail overlap */ 363 U32 u; for (u=1; u<tableSize; u++) { 364 if (u==eltNbToSkip) continue; 365 if ((table[u].pos > elt.pos) && (table[u].pos <= eltEnd)) { /* overlap, existing > new */ 366 /* append */ 367 U32 const addedLength = table[u].pos - elt.pos; 368 table[u].length += addedLength; 369 table[u].pos = elt.pos; 370 table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */ 371 table[u].savings += elt.length / 8; /* rough approx bonus */ 372 elt = table[u]; 373 /* sort : improve rank */ 374 while ((u>1) && (table[u-1].savings < elt.savings)) 375 table[u] = table[u-1], u--; 376 table[u] = elt; 377 return u; 378 } } 379 380 /* front overlap */ 381 for (u=1; u<tableSize; u++) { 382 if (u==eltNbToSkip) continue; 383 384 if ((table[u].pos + table[u].length >= elt.pos) && (table[u].pos < elt.pos)) { /* overlap, existing < new */ 385 /* append */ 386 int const addedLength = (int)eltEnd - (int)(table[u].pos + table[u].length); 387 table[u].savings += elt.length / 8; /* rough approx bonus */ 388 if (addedLength > 0) { /* otherwise, elt fully included into existing */ 389 table[u].length += addedLength; 390 table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */ 391 } 392 /* sort : improve rank */ 393 elt = table[u]; 394 while ((u>1) && (table[u-1].savings < elt.savings)) 395 table[u] = table[u-1], u--; 396 table[u] = elt; 397 return u; 398 } 399 400 if (MEM_read64(buf + table[u].pos) == MEM_read64(buf + elt.pos + 1)) { 401 if (isIncluded(buf + table[u].pos, buf + elt.pos + 1, table[u].length)) { 402 size_t const addedLength = MAX( (int)elt.length - (int)table[u].length , 1 ); 403 table[u].pos = elt.pos; 404 table[u].savings += (U32)(elt.savings * addedLength / elt.length); 405 table[u].length = MIN(elt.length, table[u].length + 1); 406 return u; 407 } 408 } 409 } 410 411 return 0; 412 } 413 414 415 static void ZDICT_removeDictItem(dictItem* table, U32 id) 416 { 417 /* convention : table[0].pos stores nb of elts */ 418 U32 const max = table[0].pos; 419 U32 u; 420 if (!id) return; /* protection, should never happen */ 421 for (u=id; u<max-1; u++) 422 table[u] = table[u+1]; 423 table->pos--; 424 } 425 426 427 static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt, const void* buffer) 428 { 429 /* merge if possible */ 430 U32 mergeId = ZDICT_tryMerge(table, elt, 0, buffer); 431 if (mergeId) { 432 U32 newMerge = 1; 433 while (newMerge) { 434 newMerge = ZDICT_tryMerge(table, table[mergeId], mergeId, buffer); 435 if (newMerge) ZDICT_removeDictItem(table, mergeId); 436 mergeId = newMerge; 437 } 438 return; 439 } 440 441 /* insert */ 442 { U32 current; 443 U32 nextElt = table->pos; 444 if (nextElt >= maxSize) nextElt = maxSize-1; 445 current = nextElt-1; 446 while (table[current].savings < elt.savings) { 447 table[current+1] = table[current]; 448 current--; 449 } 450 table[current+1] = elt; 451 table->pos = nextElt+1; 452 } 453 } 454 455 456 static U32 ZDICT_dictSize(const dictItem* dictList) 457 { 458 U32 u, dictSize = 0; 459 for (u=1; u<dictList[0].pos; u++) 460 dictSize += dictList[u].length; 461 return dictSize; 462 } 463 464 465 static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize, 466 const void* const buffer, size_t bufferSize, /* buffer must end with noisy guard band */ 467 const size_t* fileSizes, unsigned nbFiles, 468 unsigned minRatio, U32 notificationLevel) 469 { 470 int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0)); 471 int* const suffix = suffix0+1; 472 U32* reverseSuffix = (U32*)malloc((bufferSize)*sizeof(*reverseSuffix)); 473 BYTE* doneMarks = (BYTE*)malloc((bufferSize+16)*sizeof(*doneMarks)); /* +16 for overflow security */ 474 U32* filePos = (U32*)malloc(nbFiles * sizeof(*filePos)); 475 size_t result = 0; 476 clock_t displayClock = 0; 477 clock_t const refreshRate = CLOCKS_PER_SEC * 3 / 10; 478 479 # undef DISPLAYUPDATE 480 # define DISPLAYUPDATE(l, ...) \ 481 do { \ 482 if (notificationLevel>=l) { \ 483 if (ZDICT_clockSpan(displayClock) > refreshRate) { \ 484 displayClock = clock(); \ 485 DISPLAY(__VA_ARGS__); \ 486 } \ 487 if (notificationLevel>=4) fflush(stderr); \ 488 } \ 489 } while (0) 490 491 /* init */ 492 DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */ 493 if (!suffix0 || !reverseSuffix || !doneMarks || !filePos) { 494 result = ERROR(memory_allocation); 495 goto _cleanup; 496 } 497 if (minRatio < MINRATIO) minRatio = MINRATIO; 498 memset(doneMarks, 0, bufferSize+16); 499 500 /* limit sample set size (divsufsort limitation)*/ 501 if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (unsigned)(ZDICT_MAX_SAMPLES_SIZE>>20)); 502 while (bufferSize > ZDICT_MAX_SAMPLES_SIZE) bufferSize -= fileSizes[--nbFiles]; 503 504 /* sort */ 505 DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (unsigned)(bufferSize>>20)); 506 { int const divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0); 507 if (divSuftSortResult != 0) { result = ERROR(GENERIC); goto _cleanup; } 508 } 509 suffix[bufferSize] = (int)bufferSize; /* leads into noise */ 510 suffix0[0] = (int)bufferSize; /* leads into noise */ 511 /* build reverse suffix sort */ 512 { size_t pos; 513 for (pos=0; pos < bufferSize; pos++) 514 reverseSuffix[suffix[pos]] = (U32)pos; 515 /* note filePos tracks borders between samples. 516 It's not used at this stage, but planned to become useful in a later update */ 517 filePos[0] = 0; 518 for (pos=1; pos<nbFiles; pos++) 519 filePos[pos] = (U32)(filePos[pos-1] + fileSizes[pos-1]); 520 } 521 522 DISPLAYLEVEL(2, "finding patterns ... \n"); 523 DISPLAYLEVEL(3, "minimum ratio : %u \n", minRatio); 524 525 { U32 cursor; for (cursor=0; cursor < bufferSize; ) { 526 dictItem solution; 527 if (doneMarks[cursor]) { cursor++; continue; } 528 solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio, notificationLevel); 529 if (solution.length==0) { cursor++; continue; } 530 ZDICT_insertDictItem(dictList, dictListSize, solution, buffer); 531 cursor += solution.length; 532 DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / (double)bufferSize * 100.0); 533 } } 534 535 _cleanup: 536 free(suffix0); 537 free(reverseSuffix); 538 free(doneMarks); 539 free(filePos); 540 return result; 541 } 542 543 544 static void ZDICT_fillNoise(void* buffer, size_t length) 545 { 546 unsigned const prime1 = 2654435761U; 547 unsigned const prime2 = 2246822519U; 548 unsigned acc = prime1; 549 size_t p=0; 550 for (p=0; p<length; p++) { 551 acc *= prime2; 552 ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21); 553 } 554 } 555 556 557 typedef struct 558 { 559 ZSTD_CDict* dict; /* dictionary */ 560 ZSTD_CCtx* zc; /* working context */ 561 void* workPlace; /* must be ZSTD_BLOCKSIZE_MAX allocated */ 562 } EStats_ress_t; 563 564 #define MAXREPOFFSET 1024 565 566 static void ZDICT_countEStats(EStats_ress_t esr, const ZSTD_parameters* params, 567 unsigned* countLit, unsigned* offsetcodeCount, unsigned* matchlengthCount, unsigned* litlengthCount, U32* repOffsets, 568 const void* src, size_t srcSize, 569 U32 notificationLevel) 570 { 571 size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params->cParams.windowLog); 572 size_t cSize; 573 574 if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */ 575 { size_t const errorCode = ZSTD_compressBegin_usingCDict_deprecated(esr.zc, esr.dict); 576 if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_compressBegin_usingCDict failed \n"); return; } 577 578 } 579 cSize = ZSTD_compressBlock_deprecated(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize); 580 if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (unsigned)srcSize); return; } 581 582 if (cSize) { /* if == 0; block is not compressible */ 583 const seqStore_t* const seqStorePtr = ZSTD_getSeqStore(esr.zc); 584 585 /* literals stats */ 586 { const BYTE* bytePtr; 587 for(bytePtr = seqStorePtr->litStart; bytePtr < seqStorePtr->lit; bytePtr++) 588 countLit[*bytePtr]++; 589 } 590 591 /* seqStats */ 592 { U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); 593 ZSTD_seqToCodes(seqStorePtr); 594 595 { const BYTE* codePtr = seqStorePtr->ofCode; 596 U32 u; 597 for (u=0; u<nbSeq; u++) offsetcodeCount[codePtr[u]]++; 598 } 599 600 { const BYTE* codePtr = seqStorePtr->mlCode; 601 U32 u; 602 for (u=0; u<nbSeq; u++) matchlengthCount[codePtr[u]]++; 603 } 604 605 { const BYTE* codePtr = seqStorePtr->llCode; 606 U32 u; 607 for (u=0; u<nbSeq; u++) litlengthCount[codePtr[u]]++; 608 } 609 610 if (nbSeq >= 2) { /* rep offsets */ 611 const seqDef* const seq = seqStorePtr->sequencesStart; 612 U32 offset1 = seq[0].offBase - ZSTD_REP_NUM; 613 U32 offset2 = seq[1].offBase - ZSTD_REP_NUM; 614 if (offset1 >= MAXREPOFFSET) offset1 = 0; 615 if (offset2 >= MAXREPOFFSET) offset2 = 0; 616 repOffsets[offset1] += 3; 617 repOffsets[offset2] += 1; 618 } } } 619 } 620 621 static size_t ZDICT_totalSampleSize(const size_t* fileSizes, unsigned nbFiles) 622 { 623 size_t total=0; 624 unsigned u; 625 for (u=0; u<nbFiles; u++) total += fileSizes[u]; 626 return total; 627 } 628 629 typedef struct { U32 offset; U32 count; } offsetCount_t; 630 631 static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val, U32 count) 632 { 633 U32 u; 634 table[ZSTD_REP_NUM].offset = val; 635 table[ZSTD_REP_NUM].count = count; 636 for (u=ZSTD_REP_NUM; u>0; u--) { 637 offsetCount_t tmp; 638 if (table[u-1].count >= table[u].count) break; 639 tmp = table[u-1]; 640 table[u-1] = table[u]; 641 table[u] = tmp; 642 } 643 } 644 645 /* ZDICT_flatLit() : 646 * rewrite `countLit` to contain a mostly flat but still compressible distribution of literals. 647 * necessary to avoid generating a non-compressible distribution that HUF_writeCTable() cannot encode. 648 */ 649 static void ZDICT_flatLit(unsigned* countLit) 650 { 651 int u; 652 for (u=1; u<256; u++) countLit[u] = 2; 653 countLit[0] = 4; 654 countLit[253] = 1; 655 countLit[254] = 1; 656 } 657 658 #define OFFCODE_MAX 30 /* only applicable to first block */ 659 static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, 660 int compressionLevel, 661 const void* srcBuffer, const size_t* fileSizes, unsigned nbFiles, 662 const void* dictBuffer, size_t dictBufferSize, 663 unsigned notificationLevel) 664 { 665 unsigned countLit[256]; 666 HUF_CREATE_STATIC_CTABLE(hufTable, 255); 667 unsigned offcodeCount[OFFCODE_MAX+1]; 668 short offcodeNCount[OFFCODE_MAX+1]; 669 U32 offcodeMax = ZSTD_highbit32((U32)(dictBufferSize + 128 KB)); 670 unsigned matchLengthCount[MaxML+1]; 671 short matchLengthNCount[MaxML+1]; 672 unsigned litLengthCount[MaxLL+1]; 673 short litLengthNCount[MaxLL+1]; 674 U32 repOffset[MAXREPOFFSET]; 675 offsetCount_t bestRepOffset[ZSTD_REP_NUM+1]; 676 EStats_ress_t esr = { NULL, NULL, NULL }; 677 ZSTD_parameters params; 678 U32 u, huffLog = 11, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total; 679 size_t pos = 0, errorCode; 680 size_t eSize = 0; 681 size_t const totalSrcSize = ZDICT_totalSampleSize(fileSizes, nbFiles); 682 size_t const averageSampleSize = totalSrcSize / (nbFiles + !nbFiles); 683 BYTE* dstPtr = (BYTE*)dstBuffer; 684 U32 wksp[HUF_CTABLE_WORKSPACE_SIZE_U32]; 685 686 /* init */ 687 DEBUGLOG(4, "ZDICT_analyzeEntropy"); 688 if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionaryCreation_failed); goto _cleanup; } /* too large dictionary */ 689 for (u=0; u<256; u++) countLit[u] = 1; /* any character must be described */ 690 for (u=0; u<=offcodeMax; u++) offcodeCount[u] = 1; 691 for (u=0; u<=MaxML; u++) matchLengthCount[u] = 1; 692 for (u=0; u<=MaxLL; u++) litLengthCount[u] = 1; 693 memset(repOffset, 0, sizeof(repOffset)); 694 repOffset[1] = repOffset[4] = repOffset[8] = 1; 695 memset(bestRepOffset, 0, sizeof(bestRepOffset)); 696 if (compressionLevel==0) compressionLevel = ZSTD_CLEVEL_DEFAULT; 697 params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize); 698 699 esr.dict = ZSTD_createCDict_advanced(dictBuffer, dictBufferSize, ZSTD_dlm_byRef, ZSTD_dct_rawContent, params.cParams, ZSTD_defaultCMem); 700 esr.zc = ZSTD_createCCtx(); 701 esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX); 702 if (!esr.dict || !esr.zc || !esr.workPlace) { 703 eSize = ERROR(memory_allocation); 704 DISPLAYLEVEL(1, "Not enough memory \n"); 705 goto _cleanup; 706 } 707 708 /* collect stats on all samples */ 709 for (u=0; u<nbFiles; u++) { 710 ZDICT_countEStats(esr, ¶ms, 711 countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset, 712 (const char*)srcBuffer + pos, fileSizes[u], 713 notificationLevel); 714 pos += fileSizes[u]; 715 } 716 717 if (notificationLevel >= 4) { 718 /* writeStats */ 719 DISPLAYLEVEL(4, "Offset Code Frequencies : \n"); 720 for (u=0; u<=offcodeMax; u++) { 721 DISPLAYLEVEL(4, "%2u :%7u \n", u, offcodeCount[u]); 722 } } 723 724 /* analyze, build stats, starting with literals */ 725 { size_t maxNbBits = HUF_buildCTable_wksp(hufTable, countLit, 255, huffLog, wksp, sizeof(wksp)); 726 if (HUF_isError(maxNbBits)) { 727 eSize = maxNbBits; 728 DISPLAYLEVEL(1, " HUF_buildCTable error \n"); 729 goto _cleanup; 730 } 731 if (maxNbBits==8) { /* not compressible : will fail on HUF_writeCTable() */ 732 DISPLAYLEVEL(2, "warning : pathological dataset : literals are not compressible : samples are noisy or too regular \n"); 733 ZDICT_flatLit(countLit); /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */ 734 maxNbBits = HUF_buildCTable_wksp(hufTable, countLit, 255, huffLog, wksp, sizeof(wksp)); 735 assert(maxNbBits==9); 736 } 737 huffLog = (U32)maxNbBits; 738 } 739 740 /* looking for most common first offsets */ 741 { U32 offset; 742 for (offset=1; offset<MAXREPOFFSET; offset++) 743 ZDICT_insertSortCount(bestRepOffset, offset, repOffset[offset]); 744 } 745 /* note : the result of this phase should be used to better appreciate the impact on statistics */ 746 747 total=0; for (u=0; u<=offcodeMax; u++) total+=offcodeCount[u]; 748 errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax, /* useLowProbCount */ 1); 749 if (FSE_isError(errorCode)) { 750 eSize = errorCode; 751 DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount \n"); 752 goto _cleanup; 753 } 754 Offlog = (U32)errorCode; 755 756 total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u]; 757 errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML, /* useLowProbCount */ 1); 758 if (FSE_isError(errorCode)) { 759 eSize = errorCode; 760 DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount \n"); 761 goto _cleanup; 762 } 763 mlLog = (U32)errorCode; 764 765 total=0; for (u=0; u<=MaxLL; u++) total+=litLengthCount[u]; 766 errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL, /* useLowProbCount */ 1); 767 if (FSE_isError(errorCode)) { 768 eSize = errorCode; 769 DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount \n"); 770 goto _cleanup; 771 } 772 llLog = (U32)errorCode; 773 774 /* write result to buffer */ 775 { size_t const hhSize = HUF_writeCTable_wksp(dstPtr, maxDstSize, hufTable, 255, huffLog, wksp, sizeof(wksp)); 776 if (HUF_isError(hhSize)) { 777 eSize = hhSize; 778 DISPLAYLEVEL(1, "HUF_writeCTable error \n"); 779 goto _cleanup; 780 } 781 dstPtr += hhSize; 782 maxDstSize -= hhSize; 783 eSize += hhSize; 784 } 785 786 { size_t const ohSize = FSE_writeNCount(dstPtr, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog); 787 if (FSE_isError(ohSize)) { 788 eSize = ohSize; 789 DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount \n"); 790 goto _cleanup; 791 } 792 dstPtr += ohSize; 793 maxDstSize -= ohSize; 794 eSize += ohSize; 795 } 796 797 { size_t const mhSize = FSE_writeNCount(dstPtr, maxDstSize, matchLengthNCount, MaxML, mlLog); 798 if (FSE_isError(mhSize)) { 799 eSize = mhSize; 800 DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount \n"); 801 goto _cleanup; 802 } 803 dstPtr += mhSize; 804 maxDstSize -= mhSize; 805 eSize += mhSize; 806 } 807 808 { size_t const lhSize = FSE_writeNCount(dstPtr, maxDstSize, litLengthNCount, MaxLL, llLog); 809 if (FSE_isError(lhSize)) { 810 eSize = lhSize; 811 DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount \n"); 812 goto _cleanup; 813 } 814 dstPtr += lhSize; 815 maxDstSize -= lhSize; 816 eSize += lhSize; 817 } 818 819 if (maxDstSize<12) { 820 eSize = ERROR(dstSize_tooSmall); 821 DISPLAYLEVEL(1, "not enough space to write RepOffsets \n"); 822 goto _cleanup; 823 } 824 # if 0 825 MEM_writeLE32(dstPtr+0, bestRepOffset[0].offset); 826 MEM_writeLE32(dstPtr+4, bestRepOffset[1].offset); 827 MEM_writeLE32(dstPtr+8, bestRepOffset[2].offset); 828 #else 829 /* at this stage, we don't use the result of "most common first offset", 830 * as the impact of statistics is not properly evaluated */ 831 MEM_writeLE32(dstPtr+0, repStartValue[0]); 832 MEM_writeLE32(dstPtr+4, repStartValue[1]); 833 MEM_writeLE32(dstPtr+8, repStartValue[2]); 834 #endif 835 eSize += 12; 836 837 _cleanup: 838 ZSTD_freeCDict(esr.dict); 839 ZSTD_freeCCtx(esr.zc); 840 free(esr.workPlace); 841 842 return eSize; 843 } 844 845 846 /** 847 * @returns the maximum repcode value 848 */ 849 static U32 ZDICT_maxRep(U32 const reps[ZSTD_REP_NUM]) 850 { 851 U32 maxRep = reps[0]; 852 int r; 853 for (r = 1; r < ZSTD_REP_NUM; ++r) 854 maxRep = MAX(maxRep, reps[r]); 855 return maxRep; 856 } 857 858 size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity, 859 const void* customDictContent, size_t dictContentSize, 860 const void* samplesBuffer, const size_t* samplesSizes, 861 unsigned nbSamples, ZDICT_params_t params) 862 { 863 size_t hSize; 864 #define HBUFFSIZE 256 /* should prove large enough for all entropy headers */ 865 BYTE header[HBUFFSIZE]; 866 int const compressionLevel = (params.compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : params.compressionLevel; 867 U32 const notificationLevel = params.notificationLevel; 868 /* The final dictionary content must be at least as large as the largest repcode */ 869 size_t const minContentSize = (size_t)ZDICT_maxRep(repStartValue); 870 size_t paddingSize; 871 872 /* check conditions */ 873 DEBUGLOG(4, "ZDICT_finalizeDictionary"); 874 if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall); 875 if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall); 876 877 /* dictionary header */ 878 MEM_writeLE32(header, ZSTD_MAGIC_DICTIONARY); 879 { U64 const randomID = XXH64(customDictContent, dictContentSize, 0); 880 U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768; 881 U32 const dictID = params.dictID ? params.dictID : compliantID; 882 MEM_writeLE32(header+4, dictID); 883 } 884 hSize = 8; 885 886 /* entropy tables */ 887 DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */ 888 DISPLAYLEVEL(2, "statistics ... \n"); 889 { size_t const eSize = ZDICT_analyzeEntropy(header+hSize, HBUFFSIZE-hSize, 890 compressionLevel, 891 samplesBuffer, samplesSizes, nbSamples, 892 customDictContent, dictContentSize, 893 notificationLevel); 894 if (ZDICT_isError(eSize)) return eSize; 895 hSize += eSize; 896 } 897 898 /* Shrink the content size if it doesn't fit in the buffer */ 899 if (hSize + dictContentSize > dictBufferCapacity) { 900 dictContentSize = dictBufferCapacity - hSize; 901 } 902 903 /* Pad the dictionary content with zeros if it is too small */ 904 if (dictContentSize < minContentSize) { 905 RETURN_ERROR_IF(hSize + minContentSize > dictBufferCapacity, dstSize_tooSmall, 906 "dictBufferCapacity too small to fit max repcode"); 907 paddingSize = minContentSize - dictContentSize; 908 } else { 909 paddingSize = 0; 910 } 911 912 { 913 size_t const dictSize = hSize + paddingSize + dictContentSize; 914 915 /* The dictionary consists of the header, optional padding, and the content. 916 * The padding comes before the content because the "best" position in the 917 * dictionary is the last byte. 918 */ 919 BYTE* const outDictHeader = (BYTE*)dictBuffer; 920 BYTE* const outDictPadding = outDictHeader + hSize; 921 BYTE* const outDictContent = outDictPadding + paddingSize; 922 923 assert(dictSize <= dictBufferCapacity); 924 assert(outDictContent + dictContentSize == (BYTE*)dictBuffer + dictSize); 925 926 /* First copy the customDictContent into its final location. 927 * `customDictContent` and `dictBuffer` may overlap, so we must 928 * do this before any other writes into the output buffer. 929 * Then copy the header & padding into the output buffer. 930 */ 931 memmove(outDictContent, customDictContent, dictContentSize); 932 memcpy(outDictHeader, header, hSize); 933 memset(outDictPadding, 0, paddingSize); 934 935 return dictSize; 936 } 937 } 938 939 940 static size_t ZDICT_addEntropyTablesFromBuffer_advanced( 941 void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity, 942 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, 943 ZDICT_params_t params) 944 { 945 int const compressionLevel = (params.compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : params.compressionLevel; 946 U32 const notificationLevel = params.notificationLevel; 947 size_t hSize = 8; 948 949 /* calculate entropy tables */ 950 DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */ 951 DISPLAYLEVEL(2, "statistics ... \n"); 952 { size_t const eSize = ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize, 953 compressionLevel, 954 samplesBuffer, samplesSizes, nbSamples, 955 (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 956 notificationLevel); 957 if (ZDICT_isError(eSize)) return eSize; 958 hSize += eSize; 959 } 960 961 /* add dictionary header (after entropy tables) */ 962 MEM_writeLE32(dictBuffer, ZSTD_MAGIC_DICTIONARY); 963 { U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0); 964 U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768; 965 U32 const dictID = params.dictID ? params.dictID : compliantID; 966 MEM_writeLE32((char*)dictBuffer+4, dictID); 967 } 968 969 if (hSize + dictContentSize < dictBufferCapacity) 970 memmove((char*)dictBuffer + hSize, (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize); 971 return MIN(dictBufferCapacity, hSize+dictContentSize); 972 } 973 974 /*! ZDICT_trainFromBuffer_unsafe_legacy() : 975 * Warning : `samplesBuffer` must be followed by noisy guard band !!! 976 * @return : size of dictionary, or an error code which can be tested with ZDICT_isError() 977 */ 978 static size_t ZDICT_trainFromBuffer_unsafe_legacy( 979 void* dictBuffer, size_t maxDictSize, 980 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, 981 ZDICT_legacy_params_t params) 982 { 983 U32 const dictListSize = MAX(MAX(DICTLISTSIZE_DEFAULT, nbSamples), (U32)(maxDictSize/16)); 984 dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList)); 985 unsigned const selectivity = params.selectivityLevel == 0 ? g_selectivity_default : params.selectivityLevel; 986 unsigned const minRep = (selectivity > 30) ? MINRATIO : nbSamples >> selectivity; 987 size_t const targetDictSize = maxDictSize; 988 size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples); 989 size_t dictSize = 0; 990 U32 const notificationLevel = params.zParams.notificationLevel; 991 992 /* checks */ 993 if (!dictList) return ERROR(memory_allocation); 994 if (maxDictSize < ZDICT_DICTSIZE_MIN) { free(dictList); return ERROR(dstSize_tooSmall); } /* requested dictionary size is too small */ 995 if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return ERROR(dictionaryCreation_failed); } /* not enough source to create dictionary */ 996 997 /* init */ 998 ZDICT_initDictItem(dictList); 999 1000 /* build dictionary */ 1001 ZDICT_trainBuffer_legacy(dictList, dictListSize, 1002 samplesBuffer, samplesBuffSize, 1003 samplesSizes, nbSamples, 1004 minRep, notificationLevel); 1005 1006 /* display best matches */ 1007 if (params.zParams.notificationLevel>= 3) { 1008 unsigned const nb = MIN(25, dictList[0].pos); 1009 unsigned const dictContentSize = ZDICT_dictSize(dictList); 1010 unsigned u; 1011 DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", (unsigned)dictList[0].pos-1, dictContentSize); 1012 DISPLAYLEVEL(3, "list %u best segments \n", nb-1); 1013 for (u=1; u<nb; u++) { 1014 unsigned const pos = dictList[u].pos; 1015 unsigned const length = dictList[u].length; 1016 U32 const printedLength = MIN(40, length); 1017 if ((pos > samplesBuffSize) || ((pos + length) > samplesBuffSize)) { 1018 free(dictList); 1019 return ERROR(GENERIC); /* should never happen */ 1020 } 1021 DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |", 1022 u, length, pos, (unsigned)dictList[u].savings); 1023 ZDICT_printHex((const char*)samplesBuffer+pos, printedLength); 1024 DISPLAYLEVEL(3, "| \n"); 1025 } } 1026 1027 1028 /* create dictionary */ 1029 { unsigned dictContentSize = ZDICT_dictSize(dictList); 1030 if (dictContentSize < ZDICT_CONTENTSIZE_MIN) { free(dictList); return ERROR(dictionaryCreation_failed); } /* dictionary content too small */ 1031 if (dictContentSize < targetDictSize/4) { 1032 DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (unsigned)maxDictSize); 1033 if (samplesBuffSize < 10 * targetDictSize) 1034 DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (unsigned)(samplesBuffSize>>20)); 1035 if (minRep > MINRATIO) { 1036 DISPLAYLEVEL(2, "! consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1); 1037 DISPLAYLEVEL(2, "! note : larger dictionaries are not necessarily better, test its efficiency on samples \n"); 1038 } 1039 } 1040 1041 if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) { 1042 unsigned proposedSelectivity = selectivity-1; 1043 while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; } 1044 DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (unsigned)maxDictSize); 1045 DISPLAYLEVEL(2, "! consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity); 1046 DISPLAYLEVEL(2, "! always test dictionary efficiency on real samples \n"); 1047 } 1048 1049 /* limit dictionary size */ 1050 { U32 const max = dictList->pos; /* convention : nb of useful elts within dictList */ 1051 U32 currentSize = 0; 1052 U32 n; for (n=1; n<max; n++) { 1053 currentSize += dictList[n].length; 1054 if (currentSize > targetDictSize) { currentSize -= dictList[n].length; break; } 1055 } 1056 dictList->pos = n; 1057 dictContentSize = currentSize; 1058 } 1059 1060 /* build dict content */ 1061 { U32 u; 1062 BYTE* ptr = (BYTE*)dictBuffer + maxDictSize; 1063 for (u=1; u<dictList->pos; u++) { 1064 U32 l = dictList[u].length; 1065 ptr -= l; 1066 if (ptr<(BYTE*)dictBuffer) { free(dictList); return ERROR(GENERIC); } /* should not happen */ 1067 memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l); 1068 } } 1069 1070 dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize, 1071 samplesBuffer, samplesSizes, nbSamples, 1072 params.zParams); 1073 } 1074 1075 /* clean up */ 1076 free(dictList); 1077 return dictSize; 1078 } 1079 1080 1081 /* ZDICT_trainFromBuffer_legacy() : 1082 * issue : samplesBuffer need to be followed by a noisy guard band. 1083 * work around : duplicate the buffer, and add the noise */ 1084 size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity, 1085 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, 1086 ZDICT_legacy_params_t params) 1087 { 1088 size_t result; 1089 void* newBuff; 1090 size_t const sBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples); 1091 if (sBuffSize < ZDICT_MIN_SAMPLES_SIZE) return 0; /* not enough content => no dictionary */ 1092 1093 newBuff = malloc(sBuffSize + NOISELENGTH); 1094 if (!newBuff) return ERROR(memory_allocation); 1095 1096 memcpy(newBuff, samplesBuffer, sBuffSize); 1097 ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */ 1098 1099 result = 1100 ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, dictBufferCapacity, newBuff, 1101 samplesSizes, nbSamples, params); 1102 free(newBuff); 1103 return result; 1104 } 1105 1106 1107 size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity, 1108 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples) 1109 { 1110 ZDICT_fastCover_params_t params; 1111 DEBUGLOG(3, "ZDICT_trainFromBuffer"); 1112 memset(¶ms, 0, sizeof(params)); 1113 params.d = 8; 1114 params.steps = 4; 1115 /* Use default level since no compression level information is available */ 1116 params.zParams.compressionLevel = ZSTD_CLEVEL_DEFAULT; 1117 #if defined(DEBUGLEVEL) && (DEBUGLEVEL>=1) 1118 params.zParams.notificationLevel = DEBUGLEVEL; 1119 #endif 1120 return ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, dictBufferCapacity, 1121 samplesBuffer, samplesSizes, nbSamples, 1122 ¶ms); 1123 } 1124 1125 size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity, 1126 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples) 1127 { 1128 ZDICT_params_t params; 1129 memset(¶ms, 0, sizeof(params)); 1130 return ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, dictBufferCapacity, 1131 samplesBuffer, samplesSizes, nbSamples, 1132 params); 1133 } 1134