xref: /dflybsd-src/contrib/zstd/lib/dictBuilder/cover.h (revision a28cd43d19e8b720a6c852a4bbc5ae147a26165a)
1*a28cd43dSSascha Wildner /*
2*a28cd43dSSascha Wildner  * Copyright (c) 2017-2020, Facebook, Inc.
3*a28cd43dSSascha Wildner  * All rights reserved.
4*a28cd43dSSascha Wildner  *
5*a28cd43dSSascha Wildner  * This source code is licensed under both the BSD-style license (found in the
6*a28cd43dSSascha Wildner  * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7*a28cd43dSSascha Wildner  * in the COPYING file in the root directory of this source tree).
8*a28cd43dSSascha Wildner  * You may select, at your option, one of the above-listed licenses.
9*a28cd43dSSascha Wildner  */
10*a28cd43dSSascha Wildner 
11*a28cd43dSSascha Wildner #include <stdio.h>  /* fprintf */
12*a28cd43dSSascha Wildner #include <stdlib.h> /* malloc, free, qsort */
13*a28cd43dSSascha Wildner #include <string.h> /* memset */
14*a28cd43dSSascha Wildner #include <time.h>   /* clock */
15*a28cd43dSSascha Wildner #include "../common/mem.h" /* read */
16*a28cd43dSSascha Wildner #include "../common/pool.h"
17*a28cd43dSSascha Wildner #include "../common/threading.h"
18*a28cd43dSSascha Wildner #include "../common/zstd_internal.h" /* includes zstd.h */
19*a28cd43dSSascha Wildner #ifndef ZDICT_STATIC_LINKING_ONLY
20*a28cd43dSSascha Wildner #define ZDICT_STATIC_LINKING_ONLY
21*a28cd43dSSascha Wildner #endif
22*a28cd43dSSascha Wildner #include "zdict.h"
23*a28cd43dSSascha Wildner 
24*a28cd43dSSascha Wildner /**
25*a28cd43dSSascha Wildner  * COVER_best_t is used for two purposes:
26*a28cd43dSSascha Wildner  * 1. Synchronizing threads.
27*a28cd43dSSascha Wildner  * 2. Saving the best parameters and dictionary.
28*a28cd43dSSascha Wildner  *
29*a28cd43dSSascha Wildner  * All of the methods except COVER_best_init() are thread safe if zstd is
30*a28cd43dSSascha Wildner  * compiled with multithreaded support.
31*a28cd43dSSascha Wildner  */
32*a28cd43dSSascha Wildner typedef struct COVER_best_s {
33*a28cd43dSSascha Wildner   ZSTD_pthread_mutex_t mutex;
34*a28cd43dSSascha Wildner   ZSTD_pthread_cond_t cond;
35*a28cd43dSSascha Wildner   size_t liveJobs;
36*a28cd43dSSascha Wildner   void *dict;
37*a28cd43dSSascha Wildner   size_t dictSize;
38*a28cd43dSSascha Wildner   ZDICT_cover_params_t parameters;
39*a28cd43dSSascha Wildner   size_t compressedSize;
40*a28cd43dSSascha Wildner } COVER_best_t;
41*a28cd43dSSascha Wildner 
42*a28cd43dSSascha Wildner /**
43*a28cd43dSSascha Wildner  * A segment is a range in the source as well as the score of the segment.
44*a28cd43dSSascha Wildner  */
45*a28cd43dSSascha Wildner typedef struct {
46*a28cd43dSSascha Wildner   U32 begin;
47*a28cd43dSSascha Wildner   U32 end;
48*a28cd43dSSascha Wildner   U32 score;
49*a28cd43dSSascha Wildner } COVER_segment_t;
50*a28cd43dSSascha Wildner 
51*a28cd43dSSascha Wildner /**
52*a28cd43dSSascha Wildner  *Number of epochs and size of each epoch.
53*a28cd43dSSascha Wildner  */
54*a28cd43dSSascha Wildner typedef struct {
55*a28cd43dSSascha Wildner   U32 num;
56*a28cd43dSSascha Wildner   U32 size;
57*a28cd43dSSascha Wildner } COVER_epoch_info_t;
58*a28cd43dSSascha Wildner 
59*a28cd43dSSascha Wildner /**
60*a28cd43dSSascha Wildner  * Struct used for the dictionary selection function.
61*a28cd43dSSascha Wildner  */
62*a28cd43dSSascha Wildner typedef struct COVER_dictSelection {
63*a28cd43dSSascha Wildner   BYTE* dictContent;
64*a28cd43dSSascha Wildner   size_t dictSize;
65*a28cd43dSSascha Wildner   size_t totalCompressedSize;
66*a28cd43dSSascha Wildner } COVER_dictSelection_t;
67*a28cd43dSSascha Wildner 
68*a28cd43dSSascha Wildner /**
69*a28cd43dSSascha Wildner  * Computes the number of epochs and the size of each epoch.
70*a28cd43dSSascha Wildner  * We will make sure that each epoch gets at least 10 * k bytes.
71*a28cd43dSSascha Wildner  *
72*a28cd43dSSascha Wildner  * The COVER algorithms divide the data up into epochs of equal size and
73*a28cd43dSSascha Wildner  * select one segment from each epoch.
74*a28cd43dSSascha Wildner  *
75*a28cd43dSSascha Wildner  * @param maxDictSize The maximum allowed dictionary size.
76*a28cd43dSSascha Wildner  * @param nbDmers     The number of dmers we are training on.
77*a28cd43dSSascha Wildner  * @param k           The parameter k (segment size).
78*a28cd43dSSascha Wildner  * @param passes      The target number of passes over the dmer corpus.
79*a28cd43dSSascha Wildner  *                    More passes means a better dictionary.
80*a28cd43dSSascha Wildner  */
81*a28cd43dSSascha Wildner COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers,
82*a28cd43dSSascha Wildner                                        U32 k, U32 passes);
83*a28cd43dSSascha Wildner 
84*a28cd43dSSascha Wildner /**
85*a28cd43dSSascha Wildner  * Warns the user when their corpus is too small.
86*a28cd43dSSascha Wildner  */
87*a28cd43dSSascha Wildner void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel);
88*a28cd43dSSascha Wildner 
89*a28cd43dSSascha Wildner /**
90*a28cd43dSSascha Wildner  *  Checks total compressed size of a dictionary
91*a28cd43dSSascha Wildner  */
92*a28cd43dSSascha Wildner size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
93*a28cd43dSSascha Wildner                                       const size_t *samplesSizes, const BYTE *samples,
94*a28cd43dSSascha Wildner                                       size_t *offsets,
95*a28cd43dSSascha Wildner                                       size_t nbTrainSamples, size_t nbSamples,
96*a28cd43dSSascha Wildner                                       BYTE *const dict, size_t dictBufferCapacity);
97*a28cd43dSSascha Wildner 
98*a28cd43dSSascha Wildner /**
99*a28cd43dSSascha Wildner  * Returns the sum of the sample sizes.
100*a28cd43dSSascha Wildner  */
101*a28cd43dSSascha Wildner size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ;
102*a28cd43dSSascha Wildner 
103*a28cd43dSSascha Wildner /**
104*a28cd43dSSascha Wildner  * Initialize the `COVER_best_t`.
105*a28cd43dSSascha Wildner  */
106*a28cd43dSSascha Wildner void COVER_best_init(COVER_best_t *best);
107*a28cd43dSSascha Wildner 
108*a28cd43dSSascha Wildner /**
109*a28cd43dSSascha Wildner  * Wait until liveJobs == 0.
110*a28cd43dSSascha Wildner  */
111*a28cd43dSSascha Wildner void COVER_best_wait(COVER_best_t *best);
112*a28cd43dSSascha Wildner 
113*a28cd43dSSascha Wildner /**
114*a28cd43dSSascha Wildner  * Call COVER_best_wait() and then destroy the COVER_best_t.
115*a28cd43dSSascha Wildner  */
116*a28cd43dSSascha Wildner void COVER_best_destroy(COVER_best_t *best);
117*a28cd43dSSascha Wildner 
118*a28cd43dSSascha Wildner /**
119*a28cd43dSSascha Wildner  * Called when a thread is about to be launched.
120*a28cd43dSSascha Wildner  * Increments liveJobs.
121*a28cd43dSSascha Wildner  */
122*a28cd43dSSascha Wildner void COVER_best_start(COVER_best_t *best);
123*a28cd43dSSascha Wildner 
124*a28cd43dSSascha Wildner /**
125*a28cd43dSSascha Wildner  * Called when a thread finishes executing, both on error or success.
126*a28cd43dSSascha Wildner  * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
127*a28cd43dSSascha Wildner  * If this dictionary is the best so far save it and its parameters.
128*a28cd43dSSascha Wildner  */
129*a28cd43dSSascha Wildner void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
130*a28cd43dSSascha Wildner                        COVER_dictSelection_t selection);
131*a28cd43dSSascha Wildner /**
132*a28cd43dSSascha Wildner  * Error function for COVER_selectDict function. Checks if the return
133*a28cd43dSSascha Wildner  * value is an error.
134*a28cd43dSSascha Wildner  */
135*a28cd43dSSascha Wildner unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection);
136*a28cd43dSSascha Wildner 
137*a28cd43dSSascha Wildner  /**
138*a28cd43dSSascha Wildner   * Error function for COVER_selectDict function. Returns a struct where
139*a28cd43dSSascha Wildner   * return.totalCompressedSize is a ZSTD error.
140*a28cd43dSSascha Wildner   */
141*a28cd43dSSascha Wildner COVER_dictSelection_t COVER_dictSelectionError(size_t error);
142*a28cd43dSSascha Wildner 
143*a28cd43dSSascha Wildner /**
144*a28cd43dSSascha Wildner  * Always call after selectDict is called to free up used memory from
145*a28cd43dSSascha Wildner  * newly created dictionary.
146*a28cd43dSSascha Wildner  */
147*a28cd43dSSascha Wildner void COVER_dictSelectionFree(COVER_dictSelection_t selection);
148*a28cd43dSSascha Wildner 
149*a28cd43dSSascha Wildner /**
150*a28cd43dSSascha Wildner  * Called to finalize the dictionary and select one based on whether or not
151*a28cd43dSSascha Wildner  * the shrink-dict flag was enabled. If enabled the dictionary used is the
152*a28cd43dSSascha Wildner  * smallest dictionary within a specified regression of the compressed size
153*a28cd43dSSascha Wildner  * from the largest dictionary.
154*a28cd43dSSascha Wildner  */
155*a28cd43dSSascha Wildner  COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity,
156*a28cd43dSSascha Wildner                        size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
157*a28cd43dSSascha Wildner                        size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize);
158