1*3117ece4Schristos /* 2*3117ece4Schristos * Copyright (c) Meta Platforms, Inc. and affiliates. 3*3117ece4Schristos * All rights reserved. 4*3117ece4Schristos * 5*3117ece4Schristos * This source code is licensed under both the BSD-style license (found in the 6*3117ece4Schristos * LICENSE file in the root directory of this source tree) and the GPLv2 (found 7*3117ece4Schristos * in the COPYING file in the root directory of this source tree). 8*3117ece4Schristos * You may select, at your option, one of the above-listed licenses. 9*3117ece4Schristos */ 10*3117ece4Schristos 11*3117ece4Schristos #ifndef ZDICT_STATIC_LINKING_ONLY 12*3117ece4Schristos # define ZDICT_STATIC_LINKING_ONLY 13*3117ece4Schristos #endif 14*3117ece4Schristos 15*3117ece4Schristos #include "../common/threading.h" /* ZSTD_pthread_mutex_t */ 16*3117ece4Schristos #include "../common/mem.h" /* U32, BYTE */ 17*3117ece4Schristos #include "../zdict.h" 18*3117ece4Schristos 19*3117ece4Schristos /** 20*3117ece4Schristos * COVER_best_t is used for two purposes: 21*3117ece4Schristos * 1. Synchronizing threads. 22*3117ece4Schristos * 2. Saving the best parameters and dictionary. 23*3117ece4Schristos * 24*3117ece4Schristos * All of the methods except COVER_best_init() are thread safe if zstd is 25*3117ece4Schristos * compiled with multithreaded support. 26*3117ece4Schristos */ 27*3117ece4Schristos typedef struct COVER_best_s { 28*3117ece4Schristos ZSTD_pthread_mutex_t mutex; 29*3117ece4Schristos ZSTD_pthread_cond_t cond; 30*3117ece4Schristos size_t liveJobs; 31*3117ece4Schristos void *dict; 32*3117ece4Schristos size_t dictSize; 33*3117ece4Schristos ZDICT_cover_params_t parameters; 34*3117ece4Schristos size_t compressedSize; 35*3117ece4Schristos } COVER_best_t; 36*3117ece4Schristos 37*3117ece4Schristos /** 38*3117ece4Schristos * A segment is a range in the source as well as the score of the segment. 39*3117ece4Schristos */ 40*3117ece4Schristos typedef struct { 41*3117ece4Schristos U32 begin; 42*3117ece4Schristos U32 end; 43*3117ece4Schristos U32 score; 44*3117ece4Schristos } COVER_segment_t; 45*3117ece4Schristos 46*3117ece4Schristos /** 47*3117ece4Schristos *Number of epochs and size of each epoch. 48*3117ece4Schristos */ 49*3117ece4Schristos typedef struct { 50*3117ece4Schristos U32 num; 51*3117ece4Schristos U32 size; 52*3117ece4Schristos } COVER_epoch_info_t; 53*3117ece4Schristos 54*3117ece4Schristos /** 55*3117ece4Schristos * Struct used for the dictionary selection function. 56*3117ece4Schristos */ 57*3117ece4Schristos typedef struct COVER_dictSelection { 58*3117ece4Schristos BYTE* dictContent; 59*3117ece4Schristos size_t dictSize; 60*3117ece4Schristos size_t totalCompressedSize; 61*3117ece4Schristos } COVER_dictSelection_t; 62*3117ece4Schristos 63*3117ece4Schristos /** 64*3117ece4Schristos * Computes the number of epochs and the size of each epoch. 65*3117ece4Schristos * We will make sure that each epoch gets at least 10 * k bytes. 66*3117ece4Schristos * 67*3117ece4Schristos * The COVER algorithms divide the data up into epochs of equal size and 68*3117ece4Schristos * select one segment from each epoch. 69*3117ece4Schristos * 70*3117ece4Schristos * @param maxDictSize The maximum allowed dictionary size. 71*3117ece4Schristos * @param nbDmers The number of dmers we are training on. 72*3117ece4Schristos * @param k The parameter k (segment size). 73*3117ece4Schristos * @param passes The target number of passes over the dmer corpus. 74*3117ece4Schristos * More passes means a better dictionary. 75*3117ece4Schristos */ 76*3117ece4Schristos COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers, 77*3117ece4Schristos U32 k, U32 passes); 78*3117ece4Schristos 79*3117ece4Schristos /** 80*3117ece4Schristos * Warns the user when their corpus is too small. 81*3117ece4Schristos */ 82*3117ece4Schristos void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel); 83*3117ece4Schristos 84*3117ece4Schristos /** 85*3117ece4Schristos * Checks total compressed size of a dictionary 86*3117ece4Schristos */ 87*3117ece4Schristos size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters, 88*3117ece4Schristos const size_t *samplesSizes, const BYTE *samples, 89*3117ece4Schristos size_t *offsets, 90*3117ece4Schristos size_t nbTrainSamples, size_t nbSamples, 91*3117ece4Schristos BYTE *const dict, size_t dictBufferCapacity); 92*3117ece4Schristos 93*3117ece4Schristos /** 94*3117ece4Schristos * Returns the sum of the sample sizes. 95*3117ece4Schristos */ 96*3117ece4Schristos size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ; 97*3117ece4Schristos 98*3117ece4Schristos /** 99*3117ece4Schristos * Initialize the `COVER_best_t`. 100*3117ece4Schristos */ 101*3117ece4Schristos void COVER_best_init(COVER_best_t *best); 102*3117ece4Schristos 103*3117ece4Schristos /** 104*3117ece4Schristos * Wait until liveJobs == 0. 105*3117ece4Schristos */ 106*3117ece4Schristos void COVER_best_wait(COVER_best_t *best); 107*3117ece4Schristos 108*3117ece4Schristos /** 109*3117ece4Schristos * Call COVER_best_wait() and then destroy the COVER_best_t. 110*3117ece4Schristos */ 111*3117ece4Schristos void COVER_best_destroy(COVER_best_t *best); 112*3117ece4Schristos 113*3117ece4Schristos /** 114*3117ece4Schristos * Called when a thread is about to be launched. 115*3117ece4Schristos * Increments liveJobs. 116*3117ece4Schristos */ 117*3117ece4Schristos void COVER_best_start(COVER_best_t *best); 118*3117ece4Schristos 119*3117ece4Schristos /** 120*3117ece4Schristos * Called when a thread finishes executing, both on error or success. 121*3117ece4Schristos * Decrements liveJobs and signals any waiting threads if liveJobs == 0. 122*3117ece4Schristos * If this dictionary is the best so far save it and its parameters. 123*3117ece4Schristos */ 124*3117ece4Schristos void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters, 125*3117ece4Schristos COVER_dictSelection_t selection); 126*3117ece4Schristos /** 127*3117ece4Schristos * Error function for COVER_selectDict function. Checks if the return 128*3117ece4Schristos * value is an error. 129*3117ece4Schristos */ 130*3117ece4Schristos unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection); 131*3117ece4Schristos 132*3117ece4Schristos /** 133*3117ece4Schristos * Error function for COVER_selectDict function. Returns a struct where 134*3117ece4Schristos * return.totalCompressedSize is a ZSTD error. 135*3117ece4Schristos */ 136*3117ece4Schristos COVER_dictSelection_t COVER_dictSelectionError(size_t error); 137*3117ece4Schristos 138*3117ece4Schristos /** 139*3117ece4Schristos * Always call after selectDict is called to free up used memory from 140*3117ece4Schristos * newly created dictionary. 141*3117ece4Schristos */ 142*3117ece4Schristos void COVER_dictSelectionFree(COVER_dictSelection_t selection); 143*3117ece4Schristos 144*3117ece4Schristos /** 145*3117ece4Schristos * Called to finalize the dictionary and select one based on whether or not 146*3117ece4Schristos * the shrink-dict flag was enabled. If enabled the dictionary used is the 147*3117ece4Schristos * smallest dictionary within a specified regression of the compressed size 148*3117ece4Schristos * from the largest dictionary. 149*3117ece4Schristos */ 150*3117ece4Schristos COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity, 151*3117ece4Schristos size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples, 152*3117ece4Schristos size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize); 153