1a28cd43dSSascha Wildner /* 2a28cd43dSSascha Wildner * Copyright (c) 2017-2020, Facebook, Inc. 3a28cd43dSSascha Wildner * All rights reserved. 4a28cd43dSSascha Wildner * 5a28cd43dSSascha Wildner * This source code is licensed under both the BSD-style license (found in the 6a28cd43dSSascha Wildner * LICENSE file in the root directory of this source tree) and the GPLv2 (found 7a28cd43dSSascha Wildner * in the COPYING file in the root directory of this source tree). 8a28cd43dSSascha Wildner * You may select, at your option, one of the above-listed licenses. 9a28cd43dSSascha Wildner */ 10a28cd43dSSascha Wildner 11a28cd43dSSascha Wildner #include <stdio.h> /* fprintf */ 12a28cd43dSSascha Wildner #include <stdlib.h> /* malloc, free, qsort */ 13a28cd43dSSascha Wildner #include <string.h> /* memset */ 14a28cd43dSSascha Wildner #include <time.h> /* clock */ 15a28cd43dSSascha Wildner #include "../common/mem.h" /* read */ 16a28cd43dSSascha Wildner #include "../common/pool.h" 17a28cd43dSSascha Wildner #include "../common/threading.h" 18a28cd43dSSascha Wildner #include "../common/zstd_internal.h" /* includes zstd.h */ 19a28cd43dSSascha Wildner #ifndef ZDICT_STATIC_LINKING_ONLY 20a28cd43dSSascha Wildner #define ZDICT_STATIC_LINKING_ONLY 21a28cd43dSSascha Wildner #endif 22a28cd43dSSascha Wildner #include "zdict.h" 23a28cd43dSSascha Wildner 24a28cd43dSSascha Wildner /** 25a28cd43dSSascha Wildner * COVER_best_t is used for two purposes: 26a28cd43dSSascha Wildner * 1. Synchronizing threads. 27a28cd43dSSascha Wildner * 2. Saving the best parameters and dictionary. 28a28cd43dSSascha Wildner * 29a28cd43dSSascha Wildner * All of the methods except COVER_best_init() are thread safe if zstd is 30a28cd43dSSascha Wildner * compiled with multithreaded support. 31a28cd43dSSascha Wildner */ 32a28cd43dSSascha Wildner typedef struct COVER_best_s { 33a28cd43dSSascha Wildner ZSTD_pthread_mutex_t mutex; 34a28cd43dSSascha Wildner ZSTD_pthread_cond_t cond; 35a28cd43dSSascha Wildner size_t liveJobs; 36a28cd43dSSascha Wildner void *dict; 37a28cd43dSSascha Wildner size_t dictSize; 38a28cd43dSSascha Wildner ZDICT_cover_params_t parameters; 39a28cd43dSSascha Wildner size_t compressedSize; 40a28cd43dSSascha Wildner } COVER_best_t; 41a28cd43dSSascha Wildner 42a28cd43dSSascha Wildner /** 43a28cd43dSSascha Wildner * A segment is a range in the source as well as the score of the segment. 44a28cd43dSSascha Wildner */ 45a28cd43dSSascha Wildner typedef struct { 46a28cd43dSSascha Wildner U32 begin; 47a28cd43dSSascha Wildner U32 end; 48a28cd43dSSascha Wildner U32 score; 49a28cd43dSSascha Wildner } COVER_segment_t; 50a28cd43dSSascha Wildner 51a28cd43dSSascha Wildner /** 52a28cd43dSSascha Wildner *Number of epochs and size of each epoch. 53a28cd43dSSascha Wildner */ 54a28cd43dSSascha Wildner typedef struct { 55a28cd43dSSascha Wildner U32 num; 56a28cd43dSSascha Wildner U32 size; 57a28cd43dSSascha Wildner } COVER_epoch_info_t; 58a28cd43dSSascha Wildner 59a28cd43dSSascha Wildner /** 60a28cd43dSSascha Wildner * Struct used for the dictionary selection function. 61a28cd43dSSascha Wildner */ 62a28cd43dSSascha Wildner typedef struct COVER_dictSelection { 63a28cd43dSSascha Wildner BYTE* dictContent; 64a28cd43dSSascha Wildner size_t dictSize; 65a28cd43dSSascha Wildner size_t totalCompressedSize; 66a28cd43dSSascha Wildner } COVER_dictSelection_t; 67a28cd43dSSascha Wildner 68a28cd43dSSascha Wildner /** 69a28cd43dSSascha Wildner * Computes the number of epochs and the size of each epoch. 70a28cd43dSSascha Wildner * We will make sure that each epoch gets at least 10 * k bytes. 71a28cd43dSSascha Wildner * 72a28cd43dSSascha Wildner * The COVER algorithms divide the data up into epochs of equal size and 73a28cd43dSSascha Wildner * select one segment from each epoch. 74a28cd43dSSascha Wildner * 75a28cd43dSSascha Wildner * @param maxDictSize The maximum allowed dictionary size. 76a28cd43dSSascha Wildner * @param nbDmers The number of dmers we are training on. 77a28cd43dSSascha Wildner * @param k The parameter k (segment size). 78a28cd43dSSascha Wildner * @param passes The target number of passes over the dmer corpus. 79a28cd43dSSascha Wildner * More passes means a better dictionary. 80a28cd43dSSascha Wildner */ 81a28cd43dSSascha Wildner COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers, 82a28cd43dSSascha Wildner U32 k, U32 passes); 83a28cd43dSSascha Wildner 84a28cd43dSSascha Wildner /** 85a28cd43dSSascha Wildner * Warns the user when their corpus is too small. 86a28cd43dSSascha Wildner */ 87a28cd43dSSascha Wildner void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel); 88a28cd43dSSascha Wildner 89a28cd43dSSascha Wildner /** 90a28cd43dSSascha Wildner * Checks total compressed size of a dictionary 91a28cd43dSSascha Wildner */ 92a28cd43dSSascha Wildner size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters, 93a28cd43dSSascha Wildner const size_t *samplesSizes, const BYTE *samples, 94a28cd43dSSascha Wildner size_t *offsets, 95a28cd43dSSascha Wildner size_t nbTrainSamples, size_t nbSamples, 96a28cd43dSSascha Wildner BYTE *const dict, size_t dictBufferCapacity); 97a28cd43dSSascha Wildner 98a28cd43dSSascha Wildner /** 99a28cd43dSSascha Wildner * Returns the sum of the sample sizes. 100a28cd43dSSascha Wildner */ 101a28cd43dSSascha Wildner size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ; 102a28cd43dSSascha Wildner 103a28cd43dSSascha Wildner /** 104a28cd43dSSascha Wildner * Initialize the `COVER_best_t`. 105a28cd43dSSascha Wildner */ 106a28cd43dSSascha Wildner void COVER_best_init(COVER_best_t *best); 107a28cd43dSSascha Wildner 108a28cd43dSSascha Wildner /** 109a28cd43dSSascha Wildner * Wait until liveJobs == 0. 110a28cd43dSSascha Wildner */ 111a28cd43dSSascha Wildner void COVER_best_wait(COVER_best_t *best); 112a28cd43dSSascha Wildner 113a28cd43dSSascha Wildner /** 114a28cd43dSSascha Wildner * Call COVER_best_wait() and then destroy the COVER_best_t. 115a28cd43dSSascha Wildner */ 116a28cd43dSSascha Wildner void COVER_best_destroy(COVER_best_t *best); 117a28cd43dSSascha Wildner 118a28cd43dSSascha Wildner /** 119a28cd43dSSascha Wildner * Called when a thread is about to be launched. 120a28cd43dSSascha Wildner * Increments liveJobs. 121a28cd43dSSascha Wildner */ 122a28cd43dSSascha Wildner void COVER_best_start(COVER_best_t *best); 123a28cd43dSSascha Wildner 124a28cd43dSSascha Wildner /** 125a28cd43dSSascha Wildner * Called when a thread finishes executing, both on error or success. 126a28cd43dSSascha Wildner * Decrements liveJobs and signals any waiting threads if liveJobs == 0. 127a28cd43dSSascha Wildner * If this dictionary is the best so far save it and its parameters. 128a28cd43dSSascha Wildner */ 129a28cd43dSSascha Wildner void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters, 130a28cd43dSSascha Wildner COVER_dictSelection_t selection); 131a28cd43dSSascha Wildner /** 132a28cd43dSSascha Wildner * Error function for COVER_selectDict function. Checks if the return 133a28cd43dSSascha Wildner * value is an error. 134a28cd43dSSascha Wildner */ 135a28cd43dSSascha Wildner unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection); 136a28cd43dSSascha Wildner 137a28cd43dSSascha Wildner /** 138a28cd43dSSascha Wildner * Error function for COVER_selectDict function. Returns a struct where 139a28cd43dSSascha Wildner * return.totalCompressedSize is a ZSTD error. 140a28cd43dSSascha Wildner */ 141a28cd43dSSascha Wildner COVER_dictSelection_t COVER_dictSelectionError(size_t error); 142a28cd43dSSascha Wildner 143a28cd43dSSascha Wildner /** 144a28cd43dSSascha Wildner * Always call after selectDict is called to free up used memory from 145a28cd43dSSascha Wildner * newly created dictionary. 146a28cd43dSSascha Wildner */ 147a28cd43dSSascha Wildner void COVER_dictSelectionFree(COVER_dictSelection_t selection); 148a28cd43dSSascha Wildner 149a28cd43dSSascha Wildner /** 150a28cd43dSSascha Wildner * Called to finalize the dictionary and select one based on whether or not 151a28cd43dSSascha Wildner * the shrink-dict flag was enabled. If enabled the dictionary used is the 152a28cd43dSSascha Wildner * smallest dictionary within a specified regression of the compressed size 153a28cd43dSSascha Wildner * from the largest dictionary. 154a28cd43dSSascha Wildner */ 155a28cd43dSSascha Wildner COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity, 156a28cd43dSSascha Wildner size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples, 157a28cd43dSSascha Wildner size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize); 158