1*a28cd43dSSascha Wildner /* 2*a28cd43dSSascha Wildner * Copyright (c) 2017-2020, Facebook, Inc. 3*a28cd43dSSascha Wildner * All rights reserved. 4*a28cd43dSSascha Wildner * 5*a28cd43dSSascha Wildner * This source code is licensed under both the BSD-style license (found in the 6*a28cd43dSSascha Wildner * LICENSE file in the root directory of this source tree) and the GPLv2 (found 7*a28cd43dSSascha Wildner * in the COPYING file in the root directory of this source tree). 8*a28cd43dSSascha Wildner * You may select, at your option, one of the above-listed licenses. 9*a28cd43dSSascha Wildner */ 10*a28cd43dSSascha Wildner 11*a28cd43dSSascha Wildner #include <stdio.h> /* fprintf */ 12*a28cd43dSSascha Wildner #include <stdlib.h> /* malloc, free, qsort */ 13*a28cd43dSSascha Wildner #include <string.h> /* memset */ 14*a28cd43dSSascha Wildner #include <time.h> /* clock */ 15*a28cd43dSSascha Wildner #include "../common/mem.h" /* read */ 16*a28cd43dSSascha Wildner #include "../common/pool.h" 17*a28cd43dSSascha Wildner #include "../common/threading.h" 18*a28cd43dSSascha Wildner #include "../common/zstd_internal.h" /* includes zstd.h */ 19*a28cd43dSSascha Wildner #ifndef ZDICT_STATIC_LINKING_ONLY 20*a28cd43dSSascha Wildner #define ZDICT_STATIC_LINKING_ONLY 21*a28cd43dSSascha Wildner #endif 22*a28cd43dSSascha Wildner #include "zdict.h" 23*a28cd43dSSascha Wildner 24*a28cd43dSSascha Wildner /** 25*a28cd43dSSascha Wildner * COVER_best_t is used for two purposes: 26*a28cd43dSSascha Wildner * 1. Synchronizing threads. 27*a28cd43dSSascha Wildner * 2. Saving the best parameters and dictionary. 28*a28cd43dSSascha Wildner * 29*a28cd43dSSascha Wildner * All of the methods except COVER_best_init() are thread safe if zstd is 30*a28cd43dSSascha Wildner * compiled with multithreaded support. 31*a28cd43dSSascha Wildner */ 32*a28cd43dSSascha Wildner typedef struct COVER_best_s { 33*a28cd43dSSascha Wildner ZSTD_pthread_mutex_t mutex; 34*a28cd43dSSascha Wildner ZSTD_pthread_cond_t cond; 35*a28cd43dSSascha Wildner size_t liveJobs; 36*a28cd43dSSascha Wildner void *dict; 37*a28cd43dSSascha Wildner size_t dictSize; 38*a28cd43dSSascha Wildner ZDICT_cover_params_t parameters; 39*a28cd43dSSascha Wildner size_t compressedSize; 40*a28cd43dSSascha Wildner } COVER_best_t; 41*a28cd43dSSascha Wildner 42*a28cd43dSSascha Wildner /** 43*a28cd43dSSascha Wildner * A segment is a range in the source as well as the score of the segment. 44*a28cd43dSSascha Wildner */ 45*a28cd43dSSascha Wildner typedef struct { 46*a28cd43dSSascha Wildner U32 begin; 47*a28cd43dSSascha Wildner U32 end; 48*a28cd43dSSascha Wildner U32 score; 49*a28cd43dSSascha Wildner } COVER_segment_t; 50*a28cd43dSSascha Wildner 51*a28cd43dSSascha Wildner /** 52*a28cd43dSSascha Wildner *Number of epochs and size of each epoch. 53*a28cd43dSSascha Wildner */ 54*a28cd43dSSascha Wildner typedef struct { 55*a28cd43dSSascha Wildner U32 num; 56*a28cd43dSSascha Wildner U32 size; 57*a28cd43dSSascha Wildner } COVER_epoch_info_t; 58*a28cd43dSSascha Wildner 59*a28cd43dSSascha Wildner /** 60*a28cd43dSSascha Wildner * Struct used for the dictionary selection function. 61*a28cd43dSSascha Wildner */ 62*a28cd43dSSascha Wildner typedef struct COVER_dictSelection { 63*a28cd43dSSascha Wildner BYTE* dictContent; 64*a28cd43dSSascha Wildner size_t dictSize; 65*a28cd43dSSascha Wildner size_t totalCompressedSize; 66*a28cd43dSSascha Wildner } COVER_dictSelection_t; 67*a28cd43dSSascha Wildner 68*a28cd43dSSascha Wildner /** 69*a28cd43dSSascha Wildner * Computes the number of epochs and the size of each epoch. 70*a28cd43dSSascha Wildner * We will make sure that each epoch gets at least 10 * k bytes. 71*a28cd43dSSascha Wildner * 72*a28cd43dSSascha Wildner * The COVER algorithms divide the data up into epochs of equal size and 73*a28cd43dSSascha Wildner * select one segment from each epoch. 74*a28cd43dSSascha Wildner * 75*a28cd43dSSascha Wildner * @param maxDictSize The maximum allowed dictionary size. 76*a28cd43dSSascha Wildner * @param nbDmers The number of dmers we are training on. 77*a28cd43dSSascha Wildner * @param k The parameter k (segment size). 78*a28cd43dSSascha Wildner * @param passes The target number of passes over the dmer corpus. 79*a28cd43dSSascha Wildner * More passes means a better dictionary. 80*a28cd43dSSascha Wildner */ 81*a28cd43dSSascha Wildner COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers, 82*a28cd43dSSascha Wildner U32 k, U32 passes); 83*a28cd43dSSascha Wildner 84*a28cd43dSSascha Wildner /** 85*a28cd43dSSascha Wildner * Warns the user when their corpus is too small. 86*a28cd43dSSascha Wildner */ 87*a28cd43dSSascha Wildner void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel); 88*a28cd43dSSascha Wildner 89*a28cd43dSSascha Wildner /** 90*a28cd43dSSascha Wildner * Checks total compressed size of a dictionary 91*a28cd43dSSascha Wildner */ 92*a28cd43dSSascha Wildner size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters, 93*a28cd43dSSascha Wildner const size_t *samplesSizes, const BYTE *samples, 94*a28cd43dSSascha Wildner size_t *offsets, 95*a28cd43dSSascha Wildner size_t nbTrainSamples, size_t nbSamples, 96*a28cd43dSSascha Wildner BYTE *const dict, size_t dictBufferCapacity); 97*a28cd43dSSascha Wildner 98*a28cd43dSSascha Wildner /** 99*a28cd43dSSascha Wildner * Returns the sum of the sample sizes. 100*a28cd43dSSascha Wildner */ 101*a28cd43dSSascha Wildner size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ; 102*a28cd43dSSascha Wildner 103*a28cd43dSSascha Wildner /** 104*a28cd43dSSascha Wildner * Initialize the `COVER_best_t`. 105*a28cd43dSSascha Wildner */ 106*a28cd43dSSascha Wildner void COVER_best_init(COVER_best_t *best); 107*a28cd43dSSascha Wildner 108*a28cd43dSSascha Wildner /** 109*a28cd43dSSascha Wildner * Wait until liveJobs == 0. 110*a28cd43dSSascha Wildner */ 111*a28cd43dSSascha Wildner void COVER_best_wait(COVER_best_t *best); 112*a28cd43dSSascha Wildner 113*a28cd43dSSascha Wildner /** 114*a28cd43dSSascha Wildner * Call COVER_best_wait() and then destroy the COVER_best_t. 115*a28cd43dSSascha Wildner */ 116*a28cd43dSSascha Wildner void COVER_best_destroy(COVER_best_t *best); 117*a28cd43dSSascha Wildner 118*a28cd43dSSascha Wildner /** 119*a28cd43dSSascha Wildner * Called when a thread is about to be launched. 120*a28cd43dSSascha Wildner * Increments liveJobs. 121*a28cd43dSSascha Wildner */ 122*a28cd43dSSascha Wildner void COVER_best_start(COVER_best_t *best); 123*a28cd43dSSascha Wildner 124*a28cd43dSSascha Wildner /** 125*a28cd43dSSascha Wildner * Called when a thread finishes executing, both on error or success. 126*a28cd43dSSascha Wildner * Decrements liveJobs and signals any waiting threads if liveJobs == 0. 127*a28cd43dSSascha Wildner * If this dictionary is the best so far save it and its parameters. 128*a28cd43dSSascha Wildner */ 129*a28cd43dSSascha Wildner void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters, 130*a28cd43dSSascha Wildner COVER_dictSelection_t selection); 131*a28cd43dSSascha Wildner /** 132*a28cd43dSSascha Wildner * Error function for COVER_selectDict function. Checks if the return 133*a28cd43dSSascha Wildner * value is an error. 134*a28cd43dSSascha Wildner */ 135*a28cd43dSSascha Wildner unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection); 136*a28cd43dSSascha Wildner 137*a28cd43dSSascha Wildner /** 138*a28cd43dSSascha Wildner * Error function for COVER_selectDict function. Returns a struct where 139*a28cd43dSSascha Wildner * return.totalCompressedSize is a ZSTD error. 140*a28cd43dSSascha Wildner */ 141*a28cd43dSSascha Wildner COVER_dictSelection_t COVER_dictSelectionError(size_t error); 142*a28cd43dSSascha Wildner 143*a28cd43dSSascha Wildner /** 144*a28cd43dSSascha Wildner * Always call after selectDict is called to free up used memory from 145*a28cd43dSSascha Wildner * newly created dictionary. 146*a28cd43dSSascha Wildner */ 147*a28cd43dSSascha Wildner void COVER_dictSelectionFree(COVER_dictSelection_t selection); 148*a28cd43dSSascha Wildner 149*a28cd43dSSascha Wildner /** 150*a28cd43dSSascha Wildner * Called to finalize the dictionary and select one based on whether or not 151*a28cd43dSSascha Wildner * the shrink-dict flag was enabled. If enabled the dictionary used is the 152*a28cd43dSSascha Wildner * smallest dictionary within a specified regression of the compressed size 153*a28cd43dSSascha Wildner * from the largest dictionary. 154*a28cd43dSSascha Wildner */ 155*a28cd43dSSascha Wildner COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity, 156*a28cd43dSSascha Wildner size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples, 157*a28cd43dSSascha Wildner size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize); 158