xref: /netbsd-src/external/bsd/zstd/dist/lib/dictBuilder/cover.h (revision 3117ece4fc4a4ca4489ba793710b60b0d26bab6c)
1*3117ece4Schristos /*
2*3117ece4Schristos  * Copyright (c) Meta Platforms, Inc. and affiliates.
3*3117ece4Schristos  * All rights reserved.
4*3117ece4Schristos  *
5*3117ece4Schristos  * This source code is licensed under both the BSD-style license (found in the
6*3117ece4Schristos  * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7*3117ece4Schristos  * in the COPYING file in the root directory of this source tree).
8*3117ece4Schristos  * You may select, at your option, one of the above-listed licenses.
9*3117ece4Schristos  */
10*3117ece4Schristos 
11*3117ece4Schristos #ifndef ZDICT_STATIC_LINKING_ONLY
12*3117ece4Schristos #  define ZDICT_STATIC_LINKING_ONLY
13*3117ece4Schristos #endif
14*3117ece4Schristos 
15*3117ece4Schristos #include "../common/threading.h" /* ZSTD_pthread_mutex_t */
16*3117ece4Schristos #include "../common/mem.h"   /* U32, BYTE */
17*3117ece4Schristos #include "../zdict.h"
18*3117ece4Schristos 
19*3117ece4Schristos /**
20*3117ece4Schristos  * COVER_best_t is used for two purposes:
21*3117ece4Schristos  * 1. Synchronizing threads.
22*3117ece4Schristos  * 2. Saving the best parameters and dictionary.
23*3117ece4Schristos  *
24*3117ece4Schristos  * All of the methods except COVER_best_init() are thread safe if zstd is
25*3117ece4Schristos  * compiled with multithreaded support.
26*3117ece4Schristos  */
27*3117ece4Schristos typedef struct COVER_best_s {
28*3117ece4Schristos   ZSTD_pthread_mutex_t mutex;
29*3117ece4Schristos   ZSTD_pthread_cond_t cond;
30*3117ece4Schristos   size_t liveJobs;
31*3117ece4Schristos   void *dict;
32*3117ece4Schristos   size_t dictSize;
33*3117ece4Schristos   ZDICT_cover_params_t parameters;
34*3117ece4Schristos   size_t compressedSize;
35*3117ece4Schristos } COVER_best_t;
36*3117ece4Schristos 
37*3117ece4Schristos /**
38*3117ece4Schristos  * A segment is a range in the source as well as the score of the segment.
39*3117ece4Schristos  */
40*3117ece4Schristos typedef struct {
41*3117ece4Schristos   U32 begin;
42*3117ece4Schristos   U32 end;
43*3117ece4Schristos   U32 score;
44*3117ece4Schristos } COVER_segment_t;
45*3117ece4Schristos 
46*3117ece4Schristos /**
47*3117ece4Schristos  *Number of epochs and size of each epoch.
48*3117ece4Schristos  */
49*3117ece4Schristos typedef struct {
50*3117ece4Schristos   U32 num;
51*3117ece4Schristos   U32 size;
52*3117ece4Schristos } COVER_epoch_info_t;
53*3117ece4Schristos 
54*3117ece4Schristos /**
55*3117ece4Schristos  * Struct used for the dictionary selection function.
56*3117ece4Schristos  */
57*3117ece4Schristos typedef struct COVER_dictSelection {
58*3117ece4Schristos   BYTE* dictContent;
59*3117ece4Schristos   size_t dictSize;
60*3117ece4Schristos   size_t totalCompressedSize;
61*3117ece4Schristos } COVER_dictSelection_t;
62*3117ece4Schristos 
63*3117ece4Schristos /**
64*3117ece4Schristos  * Computes the number of epochs and the size of each epoch.
65*3117ece4Schristos  * We will make sure that each epoch gets at least 10 * k bytes.
66*3117ece4Schristos  *
67*3117ece4Schristos  * The COVER algorithms divide the data up into epochs of equal size and
68*3117ece4Schristos  * select one segment from each epoch.
69*3117ece4Schristos  *
70*3117ece4Schristos  * @param maxDictSize The maximum allowed dictionary size.
71*3117ece4Schristos  * @param nbDmers     The number of dmers we are training on.
72*3117ece4Schristos  * @param k           The parameter k (segment size).
73*3117ece4Schristos  * @param passes      The target number of passes over the dmer corpus.
74*3117ece4Schristos  *                    More passes means a better dictionary.
75*3117ece4Schristos  */
76*3117ece4Schristos COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers,
77*3117ece4Schristos                                        U32 k, U32 passes);
78*3117ece4Schristos 
79*3117ece4Schristos /**
80*3117ece4Schristos  * Warns the user when their corpus is too small.
81*3117ece4Schristos  */
82*3117ece4Schristos void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel);
83*3117ece4Schristos 
84*3117ece4Schristos /**
85*3117ece4Schristos  *  Checks total compressed size of a dictionary
86*3117ece4Schristos  */
87*3117ece4Schristos size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
88*3117ece4Schristos                                       const size_t *samplesSizes, const BYTE *samples,
89*3117ece4Schristos                                       size_t *offsets,
90*3117ece4Schristos                                       size_t nbTrainSamples, size_t nbSamples,
91*3117ece4Schristos                                       BYTE *const dict, size_t dictBufferCapacity);
92*3117ece4Schristos 
93*3117ece4Schristos /**
94*3117ece4Schristos  * Returns the sum of the sample sizes.
95*3117ece4Schristos  */
96*3117ece4Schristos size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ;
97*3117ece4Schristos 
98*3117ece4Schristos /**
99*3117ece4Schristos  * Initialize the `COVER_best_t`.
100*3117ece4Schristos  */
101*3117ece4Schristos void COVER_best_init(COVER_best_t *best);
102*3117ece4Schristos 
103*3117ece4Schristos /**
104*3117ece4Schristos  * Wait until liveJobs == 0.
105*3117ece4Schristos  */
106*3117ece4Schristos void COVER_best_wait(COVER_best_t *best);
107*3117ece4Schristos 
108*3117ece4Schristos /**
109*3117ece4Schristos  * Call COVER_best_wait() and then destroy the COVER_best_t.
110*3117ece4Schristos  */
111*3117ece4Schristos void COVER_best_destroy(COVER_best_t *best);
112*3117ece4Schristos 
113*3117ece4Schristos /**
114*3117ece4Schristos  * Called when a thread is about to be launched.
115*3117ece4Schristos  * Increments liveJobs.
116*3117ece4Schristos  */
117*3117ece4Schristos void COVER_best_start(COVER_best_t *best);
118*3117ece4Schristos 
119*3117ece4Schristos /**
120*3117ece4Schristos  * Called when a thread finishes executing, both on error or success.
121*3117ece4Schristos  * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
122*3117ece4Schristos  * If this dictionary is the best so far save it and its parameters.
123*3117ece4Schristos  */
124*3117ece4Schristos void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
125*3117ece4Schristos                        COVER_dictSelection_t selection);
126*3117ece4Schristos /**
127*3117ece4Schristos  * Error function for COVER_selectDict function. Checks if the return
128*3117ece4Schristos  * value is an error.
129*3117ece4Schristos  */
130*3117ece4Schristos unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection);
131*3117ece4Schristos 
132*3117ece4Schristos  /**
133*3117ece4Schristos   * Error function for COVER_selectDict function. Returns a struct where
134*3117ece4Schristos   * return.totalCompressedSize is a ZSTD error.
135*3117ece4Schristos   */
136*3117ece4Schristos COVER_dictSelection_t COVER_dictSelectionError(size_t error);
137*3117ece4Schristos 
138*3117ece4Schristos /**
139*3117ece4Schristos  * Always call after selectDict is called to free up used memory from
140*3117ece4Schristos  * newly created dictionary.
141*3117ece4Schristos  */
142*3117ece4Schristos void COVER_dictSelectionFree(COVER_dictSelection_t selection);
143*3117ece4Schristos 
144*3117ece4Schristos /**
145*3117ece4Schristos  * Called to finalize the dictionary and select one based on whether or not
146*3117ece4Schristos  * the shrink-dict flag was enabled. If enabled the dictionary used is the
147*3117ece4Schristos  * smallest dictionary within a specified regression of the compressed size
148*3117ece4Schristos  * from the largest dictionary.
149*3117ece4Schristos  */
150*3117ece4Schristos  COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity,
151*3117ece4Schristos                        size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
152*3117ece4Schristos                        size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize);
153