1 /**********************************************************************
2 Copyright(c) 2011-2016 Intel Corporation All rights reserved.
3
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
6 are met:
7 * Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 * Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
12 distribution.
13 * Neither the name of Intel Corporation nor the names of its
14 contributors may be used to endorse or promote products derived
15 from this software without specific prior written permission.
16
17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 **********************************************************************/
29
30 /* This program can be used to generate custom a custom huffman encoding to get
31 * better data compression. This is most useful when the type of data being
32 * compressed is well known.
33 *
34 * To use generate_custom_hufftables, pass a sequence of files to the program
35 * that together form an accurate representation of the data that is being
36 * compressed. Generate_custom_hufftables will then produce the file
37 * hufftables_c.c, which should be moved to replace its counterpart in the igzip
38 * source folder. After recompiling the Isa-l library, the igzip compression
39 * functions will use the new hufftables.
40 *
41 * Generate_custom_hufftables should be compiled with the same compile time
42 * parameters as the igzip source code. Generating custom hufftables with
43 * different compile time parameters may cause igzip to produce invalid output
44 * for the reasons described below. The default parameters used by
45 * generate_custom_hufftables are the same as the default parameters used by
46 * igzip.
47 *
48 * *WARNING* generate custom hufftables must be compiled with a IGZIP_HIST_SIZE
49 * that is at least as large as the IGZIP_HIST_SIZE used by igzip. By default
50 * IGZIP_HIST_SIZE is 32K, the maximum usable IGZIP_HIST_SIZE is 32K. The reason
51 * for this is to generate better compression. Igzip cannot produce look back
52 * distances with sizes larger than the IGZIP_HIST_SIZE igzip was compiled with,
53 * so look back distances with sizes larger than IGZIP_HIST_SIZE are not
54 * assigned a huffman code. The definition of LONGER_HUFFTABLES must be
55 * consistent as well since that definition changes the size of the structures
56 * printed by this tool.
57 *
58 */
59
60 #include <stdint.h>
61 #include <stdio.h>
62 #include <inttypes.h>
63 #include <string.h>
64 #include <stdlib.h>
65 #include "igzip_lib.h"
66
67 #include "huff_codes.h"
68 #include "huffman.h"
69
70 /*These max code lengths are limited by how the data is stored in
71 * hufftables.asm. The deflate standard max is 15.*/
72
73 #define MAX_HEADER_SIZE ISAL_DEF_MAX_HDR_SIZE
74
75 #define GZIP_HEADER_SIZE 10
76 #define GZIP_TRAILER_SIZE 8
77 #define ZLIB_HEADER_SIZE 2
78 #define ZLIB_TRAILER_SIZE 4
79
80 /**
81 * @brief Prints a table of uint8_t elements to a file.
82 * @param outfile: the file the table is printed to.
83 * @param table: the table to be printed.
84 * @param length: number of elements to be printed.
85 * @param header: header to append in front of the table.
86 * @param footer: footer to append at the end of the table.
87 * @param begin_line: string printed at beginning of new line
88 */
89 void
fprint_uint8_table(FILE * outfile,uint8_t * table,uint64_t length,char * header,char * footer,char * begin_line)90 fprint_uint8_table(FILE *outfile, uint8_t *table, uint64_t length, char *header, char *footer,
91 char *begin_line)
92 {
93 int i;
94 fprintf(outfile, "%s", header);
95 for (i = 0; i < length - 1; i++) {
96 if ((i & 7) == 0)
97 fprintf(outfile, "\n%s", begin_line);
98 else
99 fprintf(outfile, " ");
100 fprintf(outfile, "0x%02x,", table[i]);
101 }
102
103 if ((i & 7) == 0)
104 fprintf(outfile, "\n%s", begin_line);
105 else
106 fprintf(outfile, " ");
107 fprintf(outfile, "0x%02x", table[i]);
108 fprintf(outfile, "%s", footer);
109 }
110
111 /**
112 * @brief Prints a table of uint16_t elements to a file.
113 * @param outfile: the file the table is printed to.
114 * @param table: the table to be printed.
115 * @param length: number of elements to be printed.
116 * @param header: header to append in front of the table.
117 * @param footer: footer to append at the end of the table.
118 * @param begin_line: string printed at beginning of new line
119 */
120 void
fprint_uint16_table(FILE * outfile,uint16_t * table,uint64_t length,char * header,char * footer,char * begin_line)121 fprint_uint16_table(FILE *outfile, uint16_t *table, uint64_t length, char *header, char *footer,
122 char *begin_line)
123 {
124 int i;
125 fprintf(outfile, "%s", header);
126 for (i = 0; i < length - 1; i++) {
127 if ((i & 7) == 0)
128 fprintf(outfile, "\n%s", begin_line);
129 else
130 fprintf(outfile, " ");
131 fprintf(outfile, "0x%04x,", table[i]);
132 }
133
134 if ((i & 7) == 0)
135 fprintf(outfile, "\n%s", begin_line);
136 else
137 fprintf(outfile, " ");
138 fprintf(outfile, "0x%04x", table[i]);
139 fprintf(outfile, "%s", footer);
140 }
141
142 /**
143 * @brief Prints a table of uint32_t elements to a file.
144 * @param outfile: the file the table is printed to.
145 * @param table: the table to be printed.
146 * @param length: number of elements to be printed.
147 * @param header: header to append in front of the table.
148 * @param footer: footer to append at the end of the table.
149 * @param begin_line: string printed at beginning of new line
150 */
151 void
fprint_uint32_table(FILE * outfile,uint32_t * table,uint64_t length,char * header,char * footer,char * begin_line)152 fprint_uint32_table(FILE *outfile, uint32_t *table, uint64_t length, char *header, char *footer,
153 char *begin_line)
154 {
155 int i;
156 fprintf(outfile, "%s", header);
157 for (i = 0; i < length - 1; i++) {
158 if ((i & 3) == 0)
159 fprintf(outfile, "\n%s", begin_line);
160 else
161 fprintf(outfile, " ");
162 fprintf(outfile, "0x%08x,", table[i]);
163 }
164
165 if ((i & 3) == 0)
166 fprintf(outfile, "%s", begin_line);
167 else
168 fprintf(outfile, " ");
169 fprintf(outfile, "0x%08x", table[i]);
170 fprintf(outfile, "%s", footer);
171 }
172
173 void
fprint_hufftables(FILE * output_file,char * hufftables_name,struct isal_hufftables * hufftables)174 fprint_hufftables(FILE *output_file, char *hufftables_name, struct isal_hufftables *hufftables)
175 {
176 fprintf(output_file, "struct isal_hufftables %s = {\n\n", hufftables_name);
177
178 fprint_uint8_table(output_file, hufftables->deflate_hdr,
179 hufftables->deflate_hdr_count +
180 (hufftables->deflate_hdr_extra_bits + 7) / 8,
181 "\t.deflate_hdr = {", "},\n\n", "\t\t");
182
183 fprintf(output_file, "\t.deflate_hdr_count = %d,\n", hufftables->deflate_hdr_count);
184 fprintf(output_file, "\t.deflate_hdr_extra_bits = %d,\n\n",
185 hufftables->deflate_hdr_extra_bits);
186
187 fprint_uint32_table(output_file, hufftables->dist_table, IGZIP_DIST_TABLE_SIZE,
188 "\t.dist_table = {", "},\n\n", "\t\t");
189
190 fprint_uint32_table(output_file, hufftables->len_table, IGZIP_LEN_TABLE_SIZE,
191 "\t.len_table = {", "},\n\n", "\t\t");
192
193 fprint_uint16_table(output_file, hufftables->lit_table, IGZIP_LIT_TABLE_SIZE,
194 "\t.lit_table = {", "},\n\n", "\t\t");
195 fprint_uint8_table(output_file, hufftables->lit_table_sizes, IGZIP_LIT_TABLE_SIZE,
196 "\t.lit_table_sizes = {", "},\n\n", "\t\t");
197
198 fprint_uint16_table(output_file, hufftables->dcodes,
199 ISAL_DEF_DIST_SYMBOLS - IGZIP_DECODE_OFFSET, "\t.dcodes = {", "},\n\n",
200 "\t\t");
201 fprint_uint8_table(output_file, hufftables->dcodes_sizes,
202 ISAL_DEF_DIST_SYMBOLS - IGZIP_DECODE_OFFSET, "\t.dcodes_sizes = {",
203 "}\n", "\t\t");
204 fprintf(output_file, "};\n");
205 }
206
207 void
fprint_header(FILE * output_file)208 fprint_header(FILE *output_file)
209 {
210
211 fprintf(output_file, "#include <stdint.h>\n");
212 fprintf(output_file, "#include <igzip_lib.h>\n\n");
213
214 fprintf(output_file,
215 "#if IGZIP_HIST_SIZE > %d\n"
216 "# error \"Invalid history size for the custom hufftable\"\n"
217 "#endif\n",
218 IGZIP_HIST_SIZE);
219
220 #ifdef LONGER_HUFFTABLE
221 fprintf(output_file,
222 "#ifndef LONGER_HUFFTABLE\n"
223 "# error \"Custom hufftable requires LONGER_HUFFTABLE to be defined \"\n"
224 "#endif\n");
225 #else
226 fprintf(output_file,
227 "#ifdef LONGER_HUFFTABLE\n"
228 "# error \"Custom hufftable requires LONGER_HUFFTABLE to not be defined \"\n"
229 "#endif\n");
230 #endif
231 fprintf(output_file, "\n");
232
233 fprintf(output_file, "const uint8_t gzip_hdr[] = {\n"
234 "\t0x1f, 0x8b, 0x08, 0x00, 0x00,\n"
235 "\t0x00, 0x00, 0x00, 0x00, 0xff\t};\n\n");
236
237 fprintf(output_file, "const uint32_t gzip_hdr_bytes = %d;\n", GZIP_HEADER_SIZE);
238 fprintf(output_file, "const uint32_t gzip_trl_bytes = %d;\n\n", GZIP_TRAILER_SIZE);
239
240 fprintf(output_file, "const uint8_t zlib_hdr[] = { 0x78, 0x01 };\n\n");
241 fprintf(output_file, "const uint32_t zlib_hdr_bytes = %d;\n", ZLIB_HEADER_SIZE);
242 fprintf(output_file, "const uint32_t zlib_trl_bytes = %d;\n", ZLIB_TRAILER_SIZE);
243 }
244
245 static uint32_t
convert_dist_to_dist_sym(uint32_t dist)246 convert_dist_to_dist_sym(uint32_t dist)
247 {
248 assert(dist <= 32768 && dist > 0);
249 if (dist <= 32768) {
250 uint32_t msb = dist > 4 ? bsr(dist - 1) - 2 : 0;
251 return (msb * 2) + ((dist - 1) >> msb);
252 } else {
253 return ~0;
254 }
255 }
256
257 /**
258 * @brief Returns the deflate symbol value for a repeat length.
259 */
260 static uint32_t
convert_length_to_len_sym(uint32_t length)261 convert_length_to_len_sym(uint32_t length)
262 {
263 assert(length > 2 && length < 259);
264
265 /* Based on tables on page 11 in RFC 1951 */
266 if (length < 11)
267 return 257 + length - 3;
268 else if (length < 19)
269 return 261 + (length - 3) / 2;
270 else if (length < 35)
271 return 265 + (length - 3) / 4;
272 else if (length < 67)
273 return 269 + (length - 3) / 8;
274 else if (length < 131)
275 return 273 + (length - 3) / 16;
276 else if (length < 258)
277 return 277 + (length - 3) / 32;
278 else
279 return 285;
280 }
281
282 void
isal_update_histogram_dict(uint8_t * start_stream,int dict_length,int length,struct isal_huff_histogram * histogram)283 isal_update_histogram_dict(uint8_t *start_stream, int dict_length, int length,
284 struct isal_huff_histogram *histogram)
285 {
286 uint32_t literal = 0, hash;
287 uint16_t seen, *last_seen = histogram->hash_table;
288 uint8_t *current, *end_stream, *next_hash, *end, *end_dict;
289 uint32_t match_length;
290 uint32_t dist;
291 uint64_t *lit_len_histogram = histogram->lit_len_histogram;
292 uint64_t *dist_histogram = histogram->dist_histogram;
293
294 if (length <= 0)
295 return;
296
297 end_stream = start_stream + dict_length + length;
298 end_dict = start_stream + dict_length;
299
300 memset(last_seen, 0, sizeof(histogram->hash_table)); /* Initialize last_seen to be 0. */
301
302 for (current = start_stream; current < end_dict - 4; current++) {
303 literal = load_le_u32(current);
304 hash = compute_hash(literal) & LVL0_HASH_MASK;
305 last_seen[hash] = (current - start_stream) & 0xFFFF;
306 }
307
308 for (current = start_stream + dict_length; current < end_stream - 3; current++) {
309 literal = load_le_u32(current);
310 hash = compute_hash(literal) & LVL0_HASH_MASK;
311 seen = last_seen[hash];
312 last_seen[hash] = (current - start_stream) & 0xFFFF;
313 dist = (current - start_stream - seen) & 0xFFFF;
314 if (dist - 1 < D - 1) {
315 assert(start_stream <= current - dist);
316 match_length = compare258(current - dist, current, end_stream - current);
317 if (match_length >= SHORTEST_MATCH) {
318 next_hash = current;
319 #ifdef ISAL_LIMIT_HASH_UPDATE
320 end = next_hash + 3;
321 #else
322 end = next_hash + match_length;
323 #endif
324 if (end > end_stream - 3)
325 end = end_stream - 3;
326 next_hash++;
327 for (; next_hash < end; next_hash++) {
328 literal = load_le_u32(next_hash);
329 hash = compute_hash(literal) & LVL0_HASH_MASK;
330 last_seen[hash] = (next_hash - start_stream) & 0xFFFF;
331 }
332
333 dist_histogram[convert_dist_to_dist_sym(dist)] += 1;
334 lit_len_histogram[convert_length_to_len_sym(match_length)] += 1;
335 current += match_length - 1;
336 continue;
337 }
338 }
339 lit_len_histogram[literal & 0xFF] += 1;
340 }
341
342 for (; current < end_stream; current++)
343 lit_len_histogram[*current] += 1;
344
345 lit_len_histogram[256] += 1;
346 return;
347 }
348
349 int
main(int argc,char * argv[])350 main(int argc, char *argv[])
351 {
352 long int file_length;
353 int argi = 1;
354 uint8_t *stream = NULL;
355 struct isal_hufftables hufftables;
356 struct isal_huff_histogram histogram;
357 struct isal_zstream tmp_stream;
358 FILE *file = NULL;
359 FILE *dict_file = NULL;
360 FILE *hist_file = NULL;
361 long int dict_file_length = 0;
362 long int hist_file_length = 0;
363 uint8_t *dict_stream = NULL;
364
365 if (argc == 1) {
366 printf("Error, no input file.\n");
367 return 1;
368 }
369
370 if (argc > 3 && argv[1][0] == '-' && argv[1][1] == 'd') {
371 dict_file = fopen(argv[2], "r");
372 if (dict_file == NULL) {
373 printf("File \"%s\" open error!\n", argv[2]);
374 return 1;
375 }
376
377 fseek(dict_file, 0, SEEK_END);
378 dict_file_length = ftell(dict_file);
379 fseek(dict_file, 0, SEEK_SET);
380 dict_file_length -= ftell(dict_file);
381 dict_stream = malloc(dict_file_length);
382 if (dict_stream == NULL) {
383 printf("Failed to allocate memory to read in dictionary file\n");
384 fclose(dict_file);
385 return 1;
386 }
387 if (fread(dict_stream, 1, dict_file_length, dict_file) != dict_file_length) {
388 printf("Error occurred when reading dictionary file");
389 fclose(dict_file);
390 free(dict_stream);
391 return 1;
392 }
393 isal_update_histogram(dict_stream, dict_file_length, &histogram);
394
395 printf("Read %ld bytes of dictionary file %s\n", dict_file_length, argv[2]);
396 argi += 2;
397 fclose(dict_file);
398 free(dict_stream);
399 }
400
401 if ((argc > argi + 1) && argv[argi][0] == '-' && argv[argi][1] == 'h') {
402 hist_file = fopen(argv[argi + 1], "r+");
403 if (hist_file == NULL) {
404 printf("File \"%s\" open error!\n", argv[argi + 1]);
405 return 1;
406 }
407 fseek(hist_file, 0, SEEK_END);
408 hist_file_length = ftell(hist_file);
409 fseek(hist_file, 0, SEEK_SET);
410 hist_file_length -= ftell(hist_file);
411 if (hist_file_length > sizeof(histogram)) {
412 printf("Histogram file too long\n");
413 return 1;
414 }
415 if (fread(&histogram, 1, hist_file_length, hist_file) != hist_file_length) {
416 printf("Error occurred when reading history file");
417 fclose(hist_file);
418 return 1;
419 }
420 fseek(hist_file, 0, SEEK_SET);
421
422 printf("Read %ld bytes of history file %s\n", hist_file_length, argv[argi + 1]);
423 argi += 2;
424 } else
425 memset(&histogram, 0, sizeof(histogram)); /* Initialize histograms. */
426
427 while (argi < argc) {
428 printf("Processing %s\n", argv[argi]);
429 file = fopen(argv[argi], "r");
430 if (file == NULL) {
431 printf("Error opening file\n");
432 return 1;
433 }
434 fseek(file, 0, SEEK_END);
435 file_length = ftell(file);
436 fseek(file, 0, SEEK_SET);
437 file_length -= ftell(file);
438 stream = malloc(file_length + dict_file_length);
439 if (stream == NULL) {
440 printf("Failed to allocate memory to read in file\n");
441 fclose(file);
442 return 1;
443 }
444 if (dict_file_length > 0)
445 memcpy(stream, dict_stream, dict_file_length);
446
447 if (fread(&stream[dict_file_length], 1, file_length, file) != file_length) {
448 printf("Error occurred when reading file");
449 fclose(file);
450 free(stream);
451 return 1;
452 }
453
454 /* Create a histogram of frequency of symbols found in stream to
455 * generate the huffman tree.*/
456 if (0 == dict_file_length)
457 isal_update_histogram(stream, file_length, &histogram);
458 else
459 isal_update_histogram_dict(stream, dict_file_length, file_length,
460 &histogram);
461
462 fclose(file);
463 free(stream);
464 argi++;
465 }
466
467 isal_create_hufftables(&hufftables, &histogram);
468
469 file = fopen("hufftables_c.c", "w");
470 if (file == NULL) {
471 printf("Error creating file hufftables_c.c\n");
472 return 1;
473 }
474
475 fprint_header(file);
476
477 fprintf(file, "\n");
478
479 fprint_hufftables(file, "hufftables_default", &hufftables);
480
481 fprintf(file, "\n");
482
483 isal_deflate_stateless_init(&tmp_stream);
484 isal_deflate_set_hufftables(&tmp_stream, NULL, IGZIP_HUFFTABLE_STATIC);
485 fprint_hufftables(file, "hufftables_static", tmp_stream.hufftables);
486
487 fclose(file);
488
489 if (hist_file) {
490 int len = fwrite(&histogram, 1, sizeof(histogram), hist_file);
491 printf("wrote %d bytes of histogram file\n", len);
492 fclose(hist_file);
493 }
494 return 0;
495 }
496