/*
 * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */

/* *****************************************************************************
 * Constructs a dictionary using a heuristic based on the following paper:
 *
 * Liao, Petri, Moffat, Wirth
 * Effective Construction of Relative Lempel-Ziv Dictionaries
 * Published in WWW 2016.
 *
 * Adapted from code originally written by @ot (Giuseppe Ottaviano).
 ******************************************************************************/

/*-*************************************
* Dependencies
***************************************/
#include <stdio.h>  /* fprintf */
#include <stdlib.h> /* malloc, free, qsort */
#include <string.h> /* memset */
#include <time.h>   /* clock */

#include "../common/mem.h" /* read */
#include "../common/pool.h"
#include "../common/threading.h"
#include "cover.h"
#include "../common/zstd_internal.h" /* includes zstd.h */
#ifndef ZDICT_STATIC_LINKING_ONLY
#define ZDICT_STATIC_LINKING_ONLY
#endif
#include "zdict.h"

/*-*************************************
* Constants
***************************************/
#define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((unsigned)-1) : ((unsigned)1 GB))
#define COVER_DEFAULT_SPLITPOINT 1.0
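/* Editorial note on the constants above: with the default splitPoint of 1.0,
 * COVER_ctx_init() uses every sample for both training and testing, while a
 * splitPoint below 1.0 trains on the leading fraction of samples and tests on
 * the remainder.  COVER_MAX_SAMPLES_SIZE caps the total sample size at about
 * 4 GB on 64-bit builds and 1 GB on 32-bit builds. */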

/*-*************************************
* Console display
***************************************/
#ifndef LOCALDISPLAYLEVEL
static int g_displayLevel = 2;
#endif
#undef DISPLAY
#define DISPLAY(...)                             \
  {                                              \
    fprintf(stderr, __VA_ARGS__);                \
    fflush(stderr);                              \
  }
#undef LOCALDISPLAYLEVEL
#define LOCALDISPLAYLEVEL(displayLevel, l, ...)  \
  if (displayLevel >= l) {                       \
    DISPLAY(__VA_ARGS__);                        \
  } /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */
#undef DISPLAYLEVEL
#define DISPLAYLEVEL(l, ...) LOCALDISPLAYLEVEL(g_displayLevel, l, __VA_ARGS__)

#ifndef LOCALDISPLAYUPDATE
static const clock_t g_refreshRate = CLOCKS_PER_SEC * 15 / 100;
static clock_t g_time = 0;
#endif
#undef LOCALDISPLAYUPDATE
#define LOCALDISPLAYUPDATE(displayLevel, l, ...)                      \
  if (displayLevel >= l) {                                            \
    if ((clock() - g_time > g_refreshRate) || (displayLevel >= 4)) {  \
      g_time = clock();                                               \
      DISPLAY(__VA_ARGS__);                                           \
    }                                                                 \
  }
#undef DISPLAYUPDATE
#define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__)

/*-*************************************
* Hash table
***************************************
* A small specialized hash map for storing activeDmers.
* The map does not resize, so if it becomes full it will loop forever.
* Thus, the map must be large enough to store every value.
* The map implements linear probing and keeps its load less than 0.5.
*/
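
/* Illustrative usage sketch of the map API defined below (not actual build
 * code; `expectedEntries` and `dmerId` are placeholders):
 *
 *   COVER_map_t activeDmers;
 *   if (!COVER_map_init(&activeDmers, expectedEntries)) { handle OOM }
 *   *COVER_map_at(&activeDmers, dmerId) += 1;    insert-or-increment
 *   COVER_map_remove(&activeDmers, dmerId);      delete if present
 *   COVER_map_destroy(&activeDmers);
 */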

#define MAP_EMPTY_VALUE ((U32)-1)
typedef struct COVER_map_pair_t_s {
  U32 key;
  U32 value;
} COVER_map_pair_t;

typedef struct COVER_map_s {
  COVER_map_pair_t *data;
  U32 sizeLog;
  U32 size;
  U32 sizeMask;
} COVER_map_t;

/**
 * Clear the map.
 */
static void COVER_map_clear(COVER_map_t *map) {
  memset(map->data, MAP_EMPTY_VALUE, map->size * sizeof(COVER_map_pair_t));
}

/**
 * Initializes a map of the given size.
 * Returns 1 on success and 0 on failure.
 * The map must be destroyed with COVER_map_destroy().
 * The map is only guaranteed to be large enough to hold size elements.
 */
static int COVER_map_init(COVER_map_t *map, U32 size) {
  map->sizeLog = ZSTD_highbit32(size) + 2;
  map->size = (U32)1 << map->sizeLog;
  map->sizeMask = map->size - 1;
  map->data = (COVER_map_pair_t *)malloc(map->size * sizeof(COVER_map_pair_t));
  if (!map->data) {
    map->sizeLog = 0;
    map->size = 0;
    return 0;
  }
  COVER_map_clear(map);
  return 1;
}

/**
 * Internal hash function
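 * Knuth-style multiplicative hashing: multiply by a 32-bit golden-ratio
 * constant and keep the top sizeLog bits, which spreads consecutive keys
 * across the table.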
 */
static const U32 COVER_prime4bytes = 2654435761U;
static U32 COVER_map_hash(COVER_map_t *map, U32 key) {
  return (key * COVER_prime4bytes) >> (32 - map->sizeLog);
}

/**
 * Helper function that returns the index that a key should be placed into.
 */
static U32 COVER_map_index(COVER_map_t *map, U32 key) {
  const U32 hash = COVER_map_hash(map, key);
  U32 i;
  for (i = hash;; i = (i + 1) & map->sizeMask) {
    COVER_map_pair_t *pos = &map->data[i];
    if (pos->value == MAP_EMPTY_VALUE) {
      return i;
    }
    if (pos->key == key) {
      return i;
    }
  }
}

/**
 * Returns the pointer to the value for key.
 * If key is not in the map, it is inserted and the value is set to 0.
 * The map must not be full.
 */
static U32 *COVER_map_at(COVER_map_t *map, U32 key) {
  COVER_map_pair_t *pos = &map->data[COVER_map_index(map, key)];
  if (pos->value == MAP_EMPTY_VALUE) {
    pos->key = key;
    pos->value = 0;
  }
  return &pos->value;
}

/**
 * Deletes key from the map if present.
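 * Uses backward-shift deletion: later entries in the probe chain are moved
 * back over the hole so lookups never stop early at a spurious empty slot.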
 */
static void COVER_map_remove(COVER_map_t *map, U32 key) {
  U32 i = COVER_map_index(map, key);
  COVER_map_pair_t *del = &map->data[i];
  U32 shift = 1;
  if (del->value == MAP_EMPTY_VALUE) {
    return;
  }
  for (i = (i + 1) & map->sizeMask;; i = (i + 1) & map->sizeMask) {
    COVER_map_pair_t *const pos = &map->data[i];
    /* If the position is empty we are done */
    if (pos->value == MAP_EMPTY_VALUE) {
      del->value = MAP_EMPTY_VALUE;
      return;
    }
    /* If pos can be moved to del do so */
    if (((i - COVER_map_hash(map, pos->key)) & map->sizeMask) >= shift) {
      del->key = pos->key;
      del->value = pos->value;
      del = pos;
      shift = 1;
    } else {
      ++shift;
    }
  }
}

/**
 * Destroys a map that was initialized with COVER_map_init().
 */
static void COVER_map_destroy(COVER_map_t *map) {
  if (map->data) {
    free(map->data);
  }
  map->data = NULL;
  map->size = 0;
}

/*-*************************************
* Context
***************************************/

typedef struct {
  const BYTE *samples;
  size_t *offsets;
  const size_t *samplesSizes;
  size_t nbSamples;
  size_t nbTrainSamples;
  size_t nbTestSamples;
  U32 *suffix;
  size_t suffixSize;
  U32 *freqs;
  U32 *dmerAt;
  unsigned d;
} COVER_ctx_t;

/* We need a global context for qsort... */
static COVER_ctx_t *g_coverCtx = NULL;

/*-*************************************
* Helper functions
***************************************/

/**
 * Returns the sum of the sample sizes.
 */
size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
  size_t sum = 0;
  unsigned i;
  for (i = 0; i < nbSamples; ++i) {
    sum += samplesSizes[i];
  }
  return sum;
}

/**
 * Returns -1 if the dmer at lp is less than the dmer at rp.
 * Returns 0 if the dmers at lp and rp are equal.
 * Returns 1 if the dmer at lp is greater than the dmer at rp.
 */
static int COVER_cmp(COVER_ctx_t *ctx, const void *lp, const void *rp) {
  U32 const lhs = *(U32 const *)lp;
  U32 const rhs = *(U32 const *)rp;
  return memcmp(ctx->samples + lhs, ctx->samples + rhs, ctx->d);
}
/**
 * Faster version for d <= 8.
 */
static int COVER_cmp8(COVER_ctx_t *ctx, const void *lp, const void *rp) {
  U64 const mask = (ctx->d == 8) ? (U64)-1 : (((U64)1 << (8 * ctx->d)) - 1);
  U64 const lhs = MEM_readLE64(ctx->samples + *(U32 const *)lp) & mask;
  U64 const rhs = MEM_readLE64(ctx->samples + *(U32 const *)rp) & mask;
  if (lhs < rhs) {
    return -1;
  }
  return (lhs > rhs);
}

/**
 * Same as COVER_cmp() except ties are broken by pointer value.
 * NOTE: g_coverCtx must be set to call this function. A global is required because
 * qsort doesn't take an opaque pointer.
 */
static int WIN_CDECL COVER_strict_cmp(const void *lp, const void *rp) {
  int result = COVER_cmp(g_coverCtx, lp, rp);
  if (result == 0) {
    result = lp < rp ? -1 : 1;
  }
  return result;
}
/**
 * Faster version for d <= 8.
 */
static int WIN_CDECL COVER_strict_cmp8(const void *lp, const void *rp) {
  int result = COVER_cmp8(g_coverCtx, lp, rp);
  if (result == 0) {
    result = lp < rp ? -1 : 1;
  }
  return result;
}

/**
 * Returns the first pointer in [first, last) whose element does not compare
 * less than value. If no such element exists it returns last.
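 * (Plain binary search over a sorted range, equivalent to C++ std::lower_bound.)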
 */
static const size_t *COVER_lower_bound(const size_t *first, const size_t *last,
                                       size_t value) {
  size_t count = last - first;
  while (count != 0) {
    size_t step = count / 2;
    const size_t *ptr = first;
    ptr += step;
    if (*ptr < value) {
      first = ++ptr;
      count -= step + 1;
    } else {
      count = step;
    }
  }
  return first;
}

/**
 * Generic groupBy function.
 * Groups an array sorted by cmp into groups with equivalent values.
 * Calls grp for each group.
 */
static void
COVER_groupBy(const void *data, size_t count, size_t size, COVER_ctx_t *ctx,
              int (*cmp)(COVER_ctx_t *, const void *, const void *),
              void (*grp)(COVER_ctx_t *, const void *, const void *)) {
  const BYTE *ptr = (const BYTE *)data;
  size_t num = 0;
  while (num < count) {
    const BYTE *grpEnd = ptr + size;
    ++num;
    while (num < count && cmp(ctx, ptr, grpEnd) == 0) {
      grpEnd += size;
      ++num;
    }
    grp(ctx, ptr, grpEnd);
    ptr = grpEnd;
  }
}

/*-*************************************
* Cover functions
***************************************/

/**
 * Called on each group of positions with the same dmer.
 * Counts the frequency of each dmer and saves it in the suffix array.
 * Fills `ctx->dmerAt`.
 */
static void COVER_group(COVER_ctx_t *ctx, const void *group,
                        const void *groupEnd) {
  /* The group consists of all the positions with the same first d bytes. */
  const U32 *grpPtr = (const U32 *)group;
  const U32 *grpEnd = (const U32 *)groupEnd;
  /* The dmerId is how we will reference this dmer.
   * This allows us to map the whole dmer space to a much smaller space, the
   * size of the suffix array.
   */
  const U32 dmerId = (U32)(grpPtr - ctx->suffix);
  /* Count the number of samples this dmer shows up in */
  U32 freq = 0;
  /* Details */
  const size_t *curOffsetPtr = ctx->offsets;
  const size_t *offsetsEnd = ctx->offsets + ctx->nbSamples;
  /* Once *grpPtr >= curSampleEnd this occurrence of the dmer is in a
   * different sample than the last.
   */
  size_t curSampleEnd = ctx->offsets[0];
  for (; grpPtr != grpEnd; ++grpPtr) {
    /* Save the dmerId for this position so we can get back to it. */
    ctx->dmerAt[*grpPtr] = dmerId;
    /* Dictionaries only help for the first reference to the dmer.
     * After that zstd can reference the match from the previous reference.
     * So only count each dmer once for each sample it is in.
     */
    if (*grpPtr < curSampleEnd) {
      continue;
    }
    freq += 1;
    /* Binary search to find the end of the sample *grpPtr is in.
     * In the common case that grpPtr + 1 == grpEnd we can skip the binary
     * search because the loop is over.
     */
    if (grpPtr + 1 != grpEnd) {
      const size_t *sampleEndPtr =
          COVER_lower_bound(curOffsetPtr, offsetsEnd, *grpPtr);
      curSampleEnd = *sampleEndPtr;
      curOffsetPtr = sampleEndPtr + 1;
    }
  }
  /* At this point we are never going to look at this segment of the suffix
   * array again. We take advantage of this fact to save memory.
   * We store the frequency of the dmer in the first position of the group,
   * which is dmerId.
   */
  ctx->suffix[dmerId] = freq;
}


/**
 * Selects the best segment in an epoch.
 * Segments are scored according to the function:
 *
 * Let F(d) be the frequency of dmer d.
 * Let S_i be the dmer at position i of segment S which has length k.
 *
 * Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
 *
 * Once the dmer d is in the dictionary we set F(d) = 0.
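 *
 * The candidate segment is a sliding window of dmersInK = k - d + 1
 * positions; each position is added to and removed from activeDmers at most
 * once, so scanning an epoch costs O(end - begin) map operations.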
 */
static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
                                           COVER_map_t *activeDmers, U32 begin,
                                           U32 end,
                                           ZDICT_cover_params_t parameters) {
  /* Constants */
  const U32 k = parameters.k;
  const U32 d = parameters.d;
  const U32 dmersInK = k - d + 1;
  /* Try each segment (activeSegment) and save the best (bestSegment) */
  COVER_segment_t bestSegment = {0, 0, 0};
  COVER_segment_t activeSegment;
  /* Reset the activeDmers in the segment */
  COVER_map_clear(activeDmers);
  /* The activeSegment starts at the beginning of the epoch. */
  activeSegment.begin = begin;
  activeSegment.end = begin;
  activeSegment.score = 0;
  /* Slide the activeSegment through the whole epoch.
   * Save the best segment in bestSegment.
   */
  while (activeSegment.end < end) {
    /* The dmerId for the dmer at the next position */
    U32 newDmer = ctx->dmerAt[activeSegment.end];
    /* The entry in activeDmers for this dmerId */
    U32 *newDmerOcc = COVER_map_at(activeDmers, newDmer);
    /* If the dmer isn't already present in the segment add its score. */
    if (*newDmerOcc == 0) {
      /* The paper suggests using the L-0.5 norm, but experiments show that it
       * doesn't help.
       */
      activeSegment.score += freqs[newDmer];
    }
    /* Add the dmer to the segment */
    activeSegment.end += 1;
    *newDmerOcc += 1;

    /* If the window is now too large, drop the first position */
    if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
      U32 delDmer = ctx->dmerAt[activeSegment.begin];
      U32 *delDmerOcc = COVER_map_at(activeDmers, delDmer);
      activeSegment.begin += 1;
      *delDmerOcc -= 1;
      /* If this is the last occurrence of the dmer, subtract its score */
      if (*delDmerOcc == 0) {
        COVER_map_remove(activeDmers, delDmer);
        activeSegment.score -= freqs[delDmer];
      }
    }

    /* If this segment is the best so far save it */
    if (activeSegment.score > bestSegment.score) {
      bestSegment = activeSegment;
    }
  }
  {
    /* Trim off the zero frequency head and tail from the segment. */
    U32 newBegin = bestSegment.end;
    U32 newEnd = bestSegment.begin;
    U32 pos;
    for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
      U32 freq = freqs[ctx->dmerAt[pos]];
      if (freq != 0) {
        newBegin = MIN(newBegin, pos);
        newEnd = pos + 1;
      }
    }
    bestSegment.begin = newBegin;
    bestSegment.end = newEnd;
  }
  {
    /* Zero out the frequency of each dmer covered by the chosen segment. */
    U32 pos;
    for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
      freqs[ctx->dmerAt[pos]] = 0;
    }
  }
  return bestSegment;
}

/**
 * Check the validity of the parameters.
 * Returns non-zero if the parameters are valid and 0 otherwise.
 */
static int COVER_checkParameters(ZDICT_cover_params_t parameters,
                                 size_t maxDictSize) {
  /* k and d are required parameters */
  if (parameters.d == 0 || parameters.k == 0) {
    return 0;
  }
  /* k <= maxDictSize */
  if (parameters.k > maxDictSize) {
    return 0;
  }
  /* d <= k */
  if (parameters.d > parameters.k) {
    return 0;
  }
  /* 0 < splitPoint <= 1 */
  if (parameters.splitPoint <= 0 || parameters.splitPoint > 1) {
    return 0;
  }
  return 1;
}

/**
 * Clean up a context initialized with `COVER_ctx_init()`.
 */
static void COVER_ctx_destroy(COVER_ctx_t *ctx) {
  if (!ctx) {
    return;
  }
  if (ctx->suffix) {
    free(ctx->suffix);
    ctx->suffix = NULL;
  }
  if (ctx->freqs) {
    free(ctx->freqs);
    ctx->freqs = NULL;
  }
  if (ctx->dmerAt) {
    free(ctx->dmerAt);
    ctx->dmerAt = NULL;
  }
  if (ctx->offsets) {
    free(ctx->offsets);
    ctx->offsets = NULL;
  }
}

/**
 * Prepare a context for dictionary building.
 * The context is only dependent on the parameter `d` and can be used multiple
 * times.
 * Returns 0 on success or error code on error.
 * The context must be destroyed with `COVER_ctx_destroy()`.
 */
static size_t COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
                             const size_t *samplesSizes, unsigned nbSamples,
                             unsigned d, double splitPoint) {
  const BYTE *const samples = (const BYTE *)samplesBuffer;
  const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
  /* Split samples into testing and training sets */
  const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples;
  const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples;
  const size_t trainingSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize;
  const size_t testSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes + nbTrainSamples, nbTestSamples) : totalSamplesSize;
  /* Checks */
  if (totalSamplesSize < MAX(d, sizeof(U64)) ||
      totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {
    DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
                 (unsigned)(totalSamplesSize >> 20), (COVER_MAX_SAMPLES_SIZE >> 20));
    return ERROR(srcSize_wrong);
  }
  /* Check if there are at least 5 training samples */
  if (nbTrainSamples < 5) {
    DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples);
    return ERROR(srcSize_wrong);
  }
  /* Check if there is at least one testing sample */
  if (nbTestSamples < 1) {
    DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples);
    return ERROR(srcSize_wrong);
  }
  /* Zero the context */
  memset(ctx, 0, sizeof(*ctx));
  DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples,
               (unsigned)trainingSamplesSize);
  DISPLAYLEVEL(2, "Testing on %u samples of total size %u\n", nbTestSamples,
               (unsigned)testSamplesSize);
  ctx->samples = samples;
  ctx->samplesSizes = samplesSizes;
  ctx->nbSamples = nbSamples;
  ctx->nbTrainSamples = nbTrainSamples;
  ctx->nbTestSamples = nbTestSamples;
  /* Partial suffix array */
  ctx->suffixSize = trainingSamplesSize - MAX(d, sizeof(U64)) + 1;
  ctx->suffix = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
  /* Maps index to the dmerID */
  ctx->dmerAt = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
  /* The offsets of each file */
  ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t));
  if (!ctx->suffix || !ctx->dmerAt || !ctx->offsets) {
    DISPLAYLEVEL(1, "Failed to allocate scratch buffers\n");
    COVER_ctx_destroy(ctx);
    return ERROR(memory_allocation);
  }
  ctx->freqs = NULL;
  ctx->d = d;

  /* Fill offsets from the samplesSizes */
  {
    U32 i;
    ctx->offsets[0] = 0;
    for (i = 1; i <= nbSamples; ++i) {
      ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1];
    }
  }
  DISPLAYLEVEL(2, "Constructing partial suffix array\n");
  {
    /* suffix is a partial suffix array.
     * It only sorts suffixes by their first parameters.d bytes.
     * The sort is stable, so each dmer group is sorted by position in input.
     */
    U32 i;
    for (i = 0; i < ctx->suffixSize; ++i) {
      ctx->suffix[i] = i;
    }
    /* qsort doesn't take an opaque pointer, so pass as a global.
     * On OpenBSD qsort() is not guaranteed to be stable, their mergesort() is.
     */
    g_coverCtx = ctx;
#if defined(__OpenBSD__)
    mergesort(ctx->suffix, ctx->suffixSize, sizeof(U32),
              (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
#else
    qsort(ctx->suffix, ctx->suffixSize, sizeof(U32),
          (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
#endif
  }
  DISPLAYLEVEL(2, "Computing frequencies\n");
  /* For each dmer group (group of positions with the same first d bytes):
   * 1. For each position we set dmerAt[position] = dmerID. The dmerID is
   *    (groupBeginPtr - suffix). This allows us to go from position to
   *    dmerID so we can look up values in freq.
   * 2. We calculate how many samples the dmer occurs in and save it in
   *    freqs[dmerId].
   */
  COVER_groupBy(ctx->suffix, ctx->suffixSize, sizeof(U32), ctx,
                (ctx->d <= 8 ? &COVER_cmp8 : &COVER_cmp), &COVER_group);
  ctx->freqs = ctx->suffix;
  ctx->suffix = NULL;
  return 0;
}

void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel)
{
  const double ratio = (double)nbDmers / maxDictSize;
  if (ratio >= 10) {
    return;
  }
  LOCALDISPLAYLEVEL(displayLevel, 1,
                    "WARNING: The maximum dictionary size %u is too large "
                    "compared to the source size %u! "
                    "size(source)/size(dictionary) = %f, but it should be >= "
                    "10! This may lead to a subpar dictionary! We recommend "
                    "training on sources at least 10x, and preferably 100x "
                    "the size of the dictionary! \n", (U32)maxDictSize,
                    (U32)nbDmers, ratio);
}

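/*
 * Splits the nbDmers into epochs for COVER_buildDictionary(). The target is
 * roughly maxDictSize / (k * passes) epochs, so that over a full dictionary
 * build each epoch contributes about `passes` segments, while each epoch is
 * kept at a minimum of 10*k dmers (or all dmers when the corpus is smaller).
 */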
COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize,
                                       U32 nbDmers, U32 k, U32 passes)
{
  const U32 minEpochSize = k * 10;
  COVER_epoch_info_t epochs;
  epochs.num = MAX(1, maxDictSize / k / passes);
  epochs.size = nbDmers / epochs.num;
  if (epochs.size >= minEpochSize) {
    assert(epochs.size * epochs.num <= nbDmers);
    return epochs;
  }
  epochs.size = MIN(minEpochSize, nbDmers);
  epochs.num = nbDmers / epochs.size;
  assert(epochs.size * epochs.num <= nbDmers);
  return epochs;
}

/**
 * Given the prepared context build the dictionary.
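 * Fills dictBuffer from the back and returns `tail`, the offset of the first
 * byte of dictionary content, so the content occupies
 * dictBuffer[tail, dictBufferCapacity).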
 */
static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
                                    COVER_map_t *activeDmers, void *dictBuffer,
                                    size_t dictBufferCapacity,
                                    ZDICT_cover_params_t parameters) {
  BYTE *const dict = (BYTE *)dictBuffer;
  size_t tail = dictBufferCapacity;
  /* Divide the data into epochs. We will select one segment from each epoch. */
  const COVER_epoch_info_t epochs = COVER_computeEpochs(
      (U32)dictBufferCapacity, (U32)ctx->suffixSize, parameters.k, 4);
  const size_t maxZeroScoreRun = MAX(10, MIN(100, epochs.num >> 3));
  size_t zeroScoreRun = 0;
  size_t epoch;
  DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
               (U32)epochs.num, (U32)epochs.size);
  /* Loop through the epochs until there are no more segments or the dictionary
   * is full.
   */
  for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
    const U32 epochBegin = (U32)(epoch * epochs.size);
    const U32 epochEnd = epochBegin + epochs.size;
    size_t segmentSize;
    /* Select a segment */
    COVER_segment_t segment = COVER_selectSegment(
        ctx, freqs, activeDmers, epochBegin, epochEnd, parameters);
    /* If the segment covers no dmers, then we are out of content.
     * There may be new content in other epochs, so continue for some time.
     */
    if (segment.score == 0) {
      if (++zeroScoreRun >= maxZeroScoreRun) {
        break;
      }
      continue;
    }
    zeroScoreRun = 0;
    /* Trim the segment if necessary and if it is too small then we are done */
    segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
    if (segmentSize < parameters.d) {
      break;
    }
    /* We fill the dictionary from the back to allow the best segments to be
     * referenced with the smallest offsets.
     */
    tail -= segmentSize;
    memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);
    DISPLAYUPDATE(
        2, "\r%u%% ",
        (unsigned)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
  }
  DISPLAYLEVEL(2, "\r%79s\r", "");
  return tail;
}

ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
    void *dictBuffer, size_t dictBufferCapacity,
    const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
    ZDICT_cover_params_t parameters)
{
  BYTE* const dict = (BYTE*)dictBuffer;
  COVER_ctx_t ctx;
  COVER_map_t activeDmers;
  parameters.splitPoint = 1.0;
  /* Initialize global data */
  g_displayLevel = parameters.zParams.notificationLevel;
  /* Checks */
  if (!COVER_checkParameters(parameters, dictBufferCapacity)) {
    DISPLAYLEVEL(1, "Cover parameters incorrect\n");
    return ERROR(parameter_outOfBound);
  }
  if (nbSamples == 0) {
    DISPLAYLEVEL(1, "Cover must have at least one input file\n");
    return ERROR(srcSize_wrong);
  }
  if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
    DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
                 ZDICT_DICTSIZE_MIN);
    return ERROR(dstSize_tooSmall);
  }
  /* Initialize context and activeDmers */
  {
    size_t const initVal = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
                                          parameters.d, parameters.splitPoint);
    if (ZSTD_isError(initVal)) {
      return initVal;
    }
  }
  COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, g_displayLevel);
  if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
    DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
    COVER_ctx_destroy(&ctx);
    return ERROR(memory_allocation);
  }

  DISPLAYLEVEL(2, "Building dictionary\n");
  {
    const size_t tail =
        COVER_buildDictionary(&ctx, ctx.freqs, &activeDmers, dictBuffer,
                              dictBufferCapacity, parameters);
    const size_t dictionarySize = ZDICT_finalizeDictionary(
        dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
        samplesBuffer, samplesSizes, nbSamples, parameters.zParams);
    if (!ZSTD_isError(dictionarySize)) {
      DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
                   (unsigned)dictionarySize);
    }
    COVER_ctx_destroy(&ctx);
    COVER_map_destroy(&activeDmers);
    return dictionarySize;
  }
}



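/*
 * Compresses each testing sample (every sample when splitPoint == 1.0) with a
 * CDict built from `dict`, and returns dictBufferCapacity plus the sum of the
 * compressed sample sizes, or a ZSTD error code on failure.
 */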
size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
                                      const size_t *samplesSizes, const BYTE *samples,
                                      size_t *offsets,
                                      size_t nbTrainSamples, size_t nbSamples,
                                      BYTE *const dict, size_t dictBufferCapacity) {
  size_t totalCompressedSize = ERROR(GENERIC);
  /* Pointers */
  ZSTD_CCtx *cctx;
  ZSTD_CDict *cdict;
  void *dst;
  /* Local variables */
  size_t dstCapacity;
  size_t i;
  /* Allocate dst with enough space to compress the maximum sized sample */
  {
    size_t maxSampleSize = 0;
    i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
    for (; i < nbSamples; ++i) {
      maxSampleSize = MAX(samplesSizes[i], maxSampleSize);
    }
    dstCapacity = ZSTD_compressBound(maxSampleSize);
    dst = malloc(dstCapacity);
  }
  /* Create the cctx and cdict */
  cctx = ZSTD_createCCtx();
  cdict = ZSTD_createCDict(dict, dictBufferCapacity,
                           parameters.zParams.compressionLevel);
  if (!dst || !cctx || !cdict) {
    goto _compressCleanup;
  }
  /* Compress each sample and sum their sizes (or error) */
  totalCompressedSize = dictBufferCapacity;
  i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
  for (; i < nbSamples; ++i) {
    const size_t size = ZSTD_compress_usingCDict(
        cctx, dst, dstCapacity, samples + offsets[i],
        samplesSizes[i], cdict);
    if (ZSTD_isError(size)) {
      totalCompressedSize = size;
      goto _compressCleanup;
    }
    totalCompressedSize += size;
  }
_compressCleanup:
  ZSTD_freeCCtx(cctx);
  ZSTD_freeCDict(cdict);
  if (dst) {
    free(dst);
  }
  return totalCompressedSize;
}


/**
 * Initialize the `COVER_best_t`.
 */
void COVER_best_init(COVER_best_t *best) {
  if (best==NULL) return; /* compatible with init on NULL */
  (void)ZSTD_pthread_mutex_init(&best->mutex, NULL);
  (void)ZSTD_pthread_cond_init(&best->cond, NULL);
  best->liveJobs = 0;
  best->dict = NULL;
  best->dictSize = 0;
  best->compressedSize = (size_t)-1;
  memset(&best->parameters, 0, sizeof(best->parameters));
}

/**
 * Wait until liveJobs == 0.
 */
void COVER_best_wait(COVER_best_t *best) {
  if (!best) {
    return;
  }
  ZSTD_pthread_mutex_lock(&best->mutex);
  while (best->liveJobs != 0) {
    ZSTD_pthread_cond_wait(&best->cond, &best->mutex);
  }
  ZSTD_pthread_mutex_unlock(&best->mutex);
}

/**
 * Call COVER_best_wait() and then destroy the COVER_best_t.
 */
void COVER_best_destroy(COVER_best_t *best) {
  if (!best) {
    return;
  }
  COVER_best_wait(best);
  if (best->dict) {
    free(best->dict);
  }
  ZSTD_pthread_mutex_destroy(&best->mutex);
  ZSTD_pthread_cond_destroy(&best->cond);
}

/**
 * Called when a thread is about to be launched.
 * Increments liveJobs.
 */
void COVER_best_start(COVER_best_t *best) {
  if (!best) {
    return;
  }
  ZSTD_pthread_mutex_lock(&best->mutex);
  ++best->liveJobs;
  ZSTD_pthread_mutex_unlock(&best->mutex);
}

/**
 * Called when a thread finishes executing, on both error and success.
 * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
 * If this dictionary is the best so far save it and its parameters.
 */
COVER_best_finish(COVER_best_t * best,ZDICT_cover_params_t parameters,COVER_dictSelection_t selection)901*a28cd43dSSascha Wildner void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
902*a28cd43dSSascha Wildner COVER_dictSelection_t selection) {
903*a28cd43dSSascha Wildner void* dict = selection.dictContent;
904*a28cd43dSSascha Wildner size_t compressedSize = selection.totalCompressedSize;
905*a28cd43dSSascha Wildner size_t dictSize = selection.dictSize;
906*a28cd43dSSascha Wildner if (!best) {
907*a28cd43dSSascha Wildner return;
908*a28cd43dSSascha Wildner }
909*a28cd43dSSascha Wildner {
910*a28cd43dSSascha Wildner size_t liveJobs;
911*a28cd43dSSascha Wildner ZSTD_pthread_mutex_lock(&best->mutex);
912*a28cd43dSSascha Wildner --best->liveJobs;
913*a28cd43dSSascha Wildner liveJobs = best->liveJobs;
914*a28cd43dSSascha Wildner /* If the new dictionary is better */
915*a28cd43dSSascha Wildner if (compressedSize < best->compressedSize) {
916*a28cd43dSSascha Wildner /* Allocate space if necessary */
917*a28cd43dSSascha Wildner if (!best->dict || best->dictSize < dictSize) {
918*a28cd43dSSascha Wildner if (best->dict) {
919*a28cd43dSSascha Wildner free(best->dict);
920*a28cd43dSSascha Wildner }
921*a28cd43dSSascha Wildner best->dict = malloc(dictSize);
922*a28cd43dSSascha Wildner if (!best->dict) {
923*a28cd43dSSascha Wildner best->compressedSize = ERROR(GENERIC);
924*a28cd43dSSascha Wildner best->dictSize = 0;
925*a28cd43dSSascha Wildner ZSTD_pthread_cond_signal(&best->cond);
926*a28cd43dSSascha Wildner ZSTD_pthread_mutex_unlock(&best->mutex);
927*a28cd43dSSascha Wildner return;
928*a28cd43dSSascha Wildner }
929*a28cd43dSSascha Wildner }
930*a28cd43dSSascha Wildner /* Save the dictionary, parameters, and size */
931*a28cd43dSSascha Wildner if (dict) {
932*a28cd43dSSascha Wildner memcpy(best->dict, dict, dictSize);
933*a28cd43dSSascha Wildner best->dictSize = dictSize;
934*a28cd43dSSascha Wildner best->parameters = parameters;
935*a28cd43dSSascha Wildner best->compressedSize = compressedSize;
936*a28cd43dSSascha Wildner }
937*a28cd43dSSascha Wildner }
938*a28cd43dSSascha Wildner if (liveJobs == 0) {
939*a28cd43dSSascha Wildner ZSTD_pthread_cond_broadcast(&best->cond);
940*a28cd43dSSascha Wildner }
941*a28cd43dSSascha Wildner ZSTD_pthread_mutex_unlock(&best->mutex);
942*a28cd43dSSascha Wildner }
943*a28cd43dSSascha Wildner }
944*a28cd43dSSascha Wildner
COVER_dictSelectionError(size_t error)945*a28cd43dSSascha Wildner COVER_dictSelection_t COVER_dictSelectionError(size_t error) {
946*a28cd43dSSascha Wildner COVER_dictSelection_t selection = { NULL, 0, error };
947*a28cd43dSSascha Wildner return selection;
948*a28cd43dSSascha Wildner }
949*a28cd43dSSascha Wildner
COVER_dictSelectionIsError(COVER_dictSelection_t selection)950*a28cd43dSSascha Wildner unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection) {
951*a28cd43dSSascha Wildner return (ZSTD_isError(selection.totalCompressedSize) || !selection.dictContent);
952*a28cd43dSSascha Wildner }
953*a28cd43dSSascha Wildner
COVER_dictSelectionFree(COVER_dictSelection_t selection)954*a28cd43dSSascha Wildner void COVER_dictSelectionFree(COVER_dictSelection_t selection){
955*a28cd43dSSascha Wildner free(selection.dictContent);
956*a28cd43dSSascha Wildner }

COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity,
      size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
      size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize) {

  size_t largestDict = 0;
  size_t largestCompressed = 0;
  BYTE* customDictContentEnd = customDictContent + dictContentSize;

  BYTE* largestDictbuffer = (BYTE*)malloc(dictBufferCapacity);
  BYTE* candidateDictBuffer = (BYTE*)malloc(dictBufferCapacity);
  double regressionTolerance = ((double)params.shrinkDictMaxRegression / 100.0) + 1.00;

  if (!largestDictbuffer || !candidateDictBuffer) {
    free(largestDictbuffer);
    free(candidateDictBuffer);
    return COVER_dictSelectionError(dictContentSize);
  }

  /* Initial dictionary size and compressed size */
  memcpy(largestDictbuffer, customDictContent, dictContentSize);
  dictContentSize = ZDICT_finalizeDictionary(
    largestDictbuffer, dictBufferCapacity, customDictContent, dictContentSize,
    samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);

  if (ZDICT_isError(dictContentSize)) {
    free(largestDictbuffer);
    free(candidateDictBuffer);
    return COVER_dictSelectionError(dictContentSize);
  }

  totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
                                                       samplesBuffer, offsets,
                                                       nbCheckSamples, nbSamples,
                                                       largestDictbuffer, dictContentSize);

  if (ZSTD_isError(totalCompressedSize)) {
    free(largestDictbuffer);
    free(candidateDictBuffer);
    return COVER_dictSelectionError(totalCompressedSize);
  }

  if (params.shrinkDict == 0) {
    COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize };
    free(candidateDictBuffer);
    return selection;
  }

  largestDict = dictContentSize;
  largestCompressed = totalCompressedSize;
  dictContentSize = ZDICT_DICTSIZE_MIN;

  /* Largest dict is initially at least ZDICT_DICTSIZE_MIN */
  while (dictContentSize < largestDict) {
    memcpy(candidateDictBuffer, largestDictbuffer, largestDict);
    dictContentSize = ZDICT_finalizeDictionary(
      candidateDictBuffer, dictBufferCapacity, customDictContentEnd - dictContentSize, dictContentSize,
      samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);

    if (ZDICT_isError(dictContentSize)) {
      free(largestDictbuffer);
      free(candidateDictBuffer);
      return COVER_dictSelectionError(dictContentSize);
    }

    totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
                                                         samplesBuffer, offsets,
                                                         nbCheckSamples, nbSamples,
                                                         candidateDictBuffer, dictContentSize);

    if (ZSTD_isError(totalCompressedSize)) {
      free(largestDictbuffer);
      free(candidateDictBuffer);
      return COVER_dictSelectionError(totalCompressedSize);
    }

    if (totalCompressedSize <= largestCompressed * regressionTolerance) {
      COVER_dictSelection_t selection = { candidateDictBuffer, dictContentSize, totalCompressedSize };
      free(largestDictbuffer);
      return selection;
    }
    dictContentSize *= 2;
  }
  dictContentSize = largestDict;
  totalCompressedSize = largestCompressed;
  {
    COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize };
    free(candidateDictBuffer);
    return selection;
  }
}
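
/*
 * Worked example of the shrinking loop above (illustrative, using a
 * hypothetical configuration): with params.shrinkDict != 0 and
 * params.shrinkDictMaxRegression = 5, regressionTolerance is 1.05.  Candidate
 * dictionaries are finalized from the tail of the COVER content at sizes
 * ZDICT_DICTSIZE_MIN, 2*ZDICT_DICTSIZE_MIN, 4*ZDICT_DICTSIZE_MIN, ... and the
 * first candidate whose total compressed size stays within 5% of the
 * full-size dictionary's compressed size is returned.  If no candidate
 * qualifies before reaching the full size, the full-size dictionary wins.
 */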

/**
 * Parameters for COVER_tryParameters().
 */
typedef struct COVER_tryParameters_data_s {
  const COVER_ctx_t *ctx;
  COVER_best_t *best;
  size_t dictBufferCapacity;
  ZDICT_cover_params_t parameters;
} COVER_tryParameters_data_t;

/**
 * Tries a set of parameters and updates the COVER_best_t with the results.
 * This function is thread safe if zstd is compiled with multithreaded support.
 * It takes its parameters as an *OWNING* opaque pointer to support threading.
 */
static void COVER_tryParameters(void *opaque) {
  /* Save parameters as local variables */
  COVER_tryParameters_data_t *const data = (COVER_tryParameters_data_t *)opaque;
  const COVER_ctx_t *const ctx = data->ctx;
  const ZDICT_cover_params_t parameters = data->parameters;
  size_t dictBufferCapacity = data->dictBufferCapacity;
  size_t totalCompressedSize = ERROR(GENERIC);
  /* Allocate space for hash table, dict, and freqs */
  COVER_map_t activeDmers;
  BYTE *const dict = (BYTE *)malloc(dictBufferCapacity);
  COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));
  U32 *freqs = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
  if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
    DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
    goto _cleanup;
  }
  if (!dict || !freqs) {
    DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
    goto _cleanup;
  }
  /* Copy the frequencies because we need to modify them */
  memcpy(freqs, ctx->freqs, ctx->suffixSize * sizeof(U32));
  /* Build the dictionary */
  {
    const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict,
                                              dictBufferCapacity, parameters);
    selection = COVER_selectDict(dict + tail, dictBufferCapacity, dictBufferCapacity - tail,
        ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets,
        totalCompressedSize);

    if (COVER_dictSelectionIsError(selection)) {
      DISPLAYLEVEL(1, "Failed to select dictionary\n");
      goto _cleanup;
    }
  }
_cleanup:
  free(dict);
  COVER_best_finish(data->best, parameters, selection);
  free(data);
  COVER_map_destroy(&activeDmers);
  COVER_dictSelectionFree(selection);
  if (freqs) {
    free(freqs);
  }
}
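
/*
 * Ownership sketch (illustrative): the caller heap-allocates a
 * COVER_tryParameters_data_t, fills it in, and hands it off either to a
 * worker pool or to a direct call.  In both cases COVER_tryParameters()
 * frees the struct, so the caller must not touch it afterwards.  This
 * mirrors the call site in ZDICT_optimizeTrainFromBuffer_cover() below.
 *
 *   COVER_tryParameters_data_t *data = malloc(sizeof(*data));
 *   data->ctx = &ctx;
 *   data->best = &best;
 *   data->dictBufferCapacity = dictBufferCapacity;
 *   data->parameters = params;
 *   if (pool) POOL_add(pool, &COVER_tryParameters, data);
 *   else      COVER_tryParameters(data);   // data is freed by the callee
 */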

ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
    void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
    const size_t *samplesSizes, unsigned nbSamples,
    ZDICT_cover_params_t *parameters) {
  /* constants */
  const unsigned nbThreads = parameters->nbThreads;
  const double splitPoint =
      parameters->splitPoint <= 0.0 ? COVER_DEFAULT_SPLITPOINT : parameters->splitPoint;
  const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
  const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;
  const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;
  const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k;
  const unsigned kSteps = parameters->steps == 0 ? 40 : parameters->steps;
  const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
  const unsigned kIterations =
      (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
  const unsigned shrinkDict = 0;
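  /*
   * Worked example of the grid above (illustrative): with all parameters left
   * at 0, the search covers d in {6, 8} and k from 50 to 2000 in 40 steps, so
   * kStepSize = (2000 - 50) / 40 = 48 and kIterations =
   * (1 + (8 - 6) / 2) * (1 + (2000 - 50) / 48) = 2 * 41 = 82 parameter sets.
   */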
  /* Local variables */
  const int displayLevel = parameters->zParams.notificationLevel;
  unsigned iteration = 1;
  unsigned d;
  unsigned k;
  COVER_best_t best;
  POOL_ctx *pool = NULL;
  int warned = 0;

  /* Checks */
  if (splitPoint <= 0 || splitPoint > 1) {
    LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
    return ERROR(parameter_outOfBound);
  }
  if (kMinK < kMaxD || kMaxK < kMinK) {
    LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
    return ERROR(parameter_outOfBound);
  }
  if (nbSamples == 0) {
    DISPLAYLEVEL(1, "Cover must have at least one input file\n");
    return ERROR(srcSize_wrong);
  }
  if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
    DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
                 ZDICT_DICTSIZE_MIN);
    return ERROR(dstSize_tooSmall);
  }
  if (nbThreads > 1) {
    pool = POOL_create(nbThreads, 1);
    if (!pool) {
      return ERROR(memory_allocation);
    }
  }
  /* Initialization */
  COVER_best_init(&best);
  /* Turn down global display level to clean up display at level 2 and below */
  g_displayLevel = displayLevel == 0 ? 0 : displayLevel - 1;
  /* Loop through d first because each new value needs a new context */
  LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n",
                    kIterations);
  for (d = kMinD; d <= kMaxD; d += 2) {
    /* Initialize the context for this value of d */
    COVER_ctx_t ctx;
    LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
    {
      const size_t initVal = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint);
      if (ZSTD_isError(initVal)) {
        LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
        COVER_best_destroy(&best);
        POOL_free(pool);
        return initVal;
      }
    }
    if (!warned) {
      COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, displayLevel);
      warned = 1;
    }
    /* Loop through k reusing the same context */
    for (k = kMinK; k <= kMaxK; k += kStepSize) {
      /* Prepare the arguments */
      COVER_tryParameters_data_t *data = (COVER_tryParameters_data_t *)malloc(
          sizeof(COVER_tryParameters_data_t));
      LOCALDISPLAYLEVEL(displayLevel, 3, "k=%u\n", k);
      if (!data) {
        LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to allocate parameters\n");
        COVER_best_destroy(&best);
        COVER_ctx_destroy(&ctx);
        POOL_free(pool);
        return ERROR(memory_allocation);
      }
      data->ctx = &ctx;
      data->best = &best;
      data->dictBufferCapacity = dictBufferCapacity;
      data->parameters = *parameters;
      data->parameters.k = k;
      data->parameters.d = d;
      data->parameters.splitPoint = splitPoint;
      data->parameters.steps = kSteps;
      data->parameters.shrinkDict = shrinkDict;
      data->parameters.zParams.notificationLevel = g_displayLevel;
      /* Check the parameters */
      if (!COVER_checkParameters(data->parameters, dictBufferCapacity)) {
        DISPLAYLEVEL(1, "Cover parameters incorrect\n");
        free(data);
        continue;
      }
      /* Call the function and pass ownership of data to it */
      COVER_best_start(&best);
      if (pool) {
        POOL_add(pool, &COVER_tryParameters, data);
      } else {
        COVER_tryParameters(data);
      }
      /* Print status */
      LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%% ",
                         (unsigned)((iteration * 100) / kIterations));
      ++iteration;
    }
    COVER_best_wait(&best);
    COVER_ctx_destroy(&ctx);
  }
  LOCALDISPLAYLEVEL(displayLevel, 2, "\r%79s\r", "");
  /* Fill the output buffer and parameters with output of the best parameters */
  {
    const size_t dictSize = best.dictSize;
    if (ZSTD_isError(best.compressedSize)) {
      const size_t compressedSize = best.compressedSize;
      COVER_best_destroy(&best);
      POOL_free(pool);
      return compressedSize;
    }
    *parameters = best.parameters;
    memcpy(dictBuffer, best.dict, dictSize);
    COVER_best_destroy(&best);
    POOL_free(pool);
    return dictSize;
  }
}
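
/*
 * Usage sketch (illustrative, not part of the library): a caller with a
 * concatenated sample buffer and per-sample sizes can search the parameter
 * grid and retrieve the winning dictionary and parameters like this.  The
 * buffer names are hypothetical; zeroed parameters select the default grid.
 *
 *   ZDICT_cover_params_t params;
 *   memset(&params, 0, sizeof(params));      // 0 values mean "search defaults"
 *   params.nbThreads = 4;
 *   {
 *     size_t const dictSize = ZDICT_optimizeTrainFromBuffer_cover(
 *         dictBuffer, dictBufferCapacity,
 *         samplesBuffer, samplesSizes, nbSamples, &params);
 *     if (ZDICT_isError(dictSize)) { return 1; }  // handle error
 *     // dictBuffer holds the best dictionary; params holds the winning k and d
 *   }
 */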