xref: /isa-l/igzip/generate_custom_hufftables.c (revision 9ee34ec0f536d26f4b3a07de9f30104f83e4b312)
1 /**********************************************************************
2   Copyright(c) 2011-2016 Intel Corporation All rights reserved.
3 
4   Redistribution and use in source and binary forms, with or without
5   modification, are permitted provided that the following conditions
6   are met:
7     * Redistributions of source code must retain the above copyright
8       notice, this list of conditions and the following disclaimer.
9     * Redistributions in binary form must reproduce the above copyright
10       notice, this list of conditions and the following disclaimer in
11       the documentation and/or other materials provided with the
12       distribution.
13     * Neither the name of Intel Corporation nor the names of its
14       contributors may be used to endorse or promote products derived
15       from this software without specific prior written permission.
16 
17   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 **********************************************************************/
29 
30 /* This program can be used to generate custom a custom huffman encoding to get
31  * better data compression. This is most useful when the type of data being
32  * compressed is well known.
33  *
34  * To use generate_custom_hufftables, pass a sequence of files to the program
35  * that together form an accurate representation of the data that is being
36  * compressed. Generate_custom_hufftables will then produce the file
37  * hufftables_c.c, which should be moved to replace its counterpart in the igzip
38  * source folder. After recompiling the Isa-l library, the igzip compression
39  * functions will use the new hufftables.
40  *
41  * Generate_custom_hufftables should be compiled with the same compile time
42  * parameters as the igzip source code. Generating custom hufftables with
43  * different compile time parameters may cause igzip to produce invalid output
44  * for the reasons described below. The default parameters used by
45  * generate_custom_hufftables are the same as the default parameters used by
46  * igzip.
47  *
48  * *WARNING* generate custom hufftables must be compiled with a IGZIP_HIST_SIZE
49  * that is at least as large as the IGZIP_HIST_SIZE used by igzip. By default
50  * IGZIP_HIST_SIZE is 32K, the maximum usable IGZIP_HIST_SIZE is 32K. The reason
51  * for this is to generate better compression. Igzip cannot produce look back
52  * distances with sizes larger than the IGZIP_HIST_SIZE igzip was compiled with,
53  * so look back distances with sizes larger than IGZIP_HIST_SIZE are not
54  * assigned a huffman code. The definition of LONGER_HUFFTABLES must be
55  * consistent as well since that definition changes the size of the structures
56  * printed by this tool.
57  *
58  */
59 
60 #include <stdint.h>
61 #include <stdio.h>
62 #include <inttypes.h>
63 #include <string.h>
64 #include <stdlib.h>
65 #include "igzip_lib.h"
66 
67 #include "huff_codes.h"
68 #include "huffman.h"
69 
70 /*These max code lengths are limited by how the data is stored in
71  * hufftables.asm. The deflate standard max is 15.*/
72 
73 #define MAX_HEADER_SIZE ISAL_DEF_MAX_HDR_SIZE
74 
75 #define GZIP_HEADER_SIZE 10
76 #define GZIP_TRAILER_SIZE 8
77 #define ZLIB_HEADER_SIZE 2
78 #define ZLIB_TRAILER_SIZE 4
79 
80 /**
81  * @brief Prints a table of uint8_t elements to a file.
82  * @param outfile: the file the table is printed to.
83  * @param table: the table to be printed.
84  * @param length: number of elements to be printed.
85  * @param header: header to append in front of the table.
86  * @param footer: footer to append at the end of the table.
87  * @param begin_line: string printed at beginning of new line
88  */
89 void fprint_uint8_table(FILE * outfile, uint8_t * table, uint64_t length, char *header,
90 			char *footer, char *begin_line)
91 {
92 	int i;
93 	fprintf(outfile, "%s", header);
94 	for (i = 0; i < length - 1; i++) {
95 		if ((i & 7) == 0)
96 			fprintf(outfile, "\n%s", begin_line);
97 		else
98 			fprintf(outfile, " ");
99 		fprintf(outfile, "0x%02x,", table[i]);
100 	}
101 
102 	if ((i & 7) == 0)
103 		fprintf(outfile, "\n%s", begin_line);
104 	else
105 		fprintf(outfile, " ");
106 	fprintf(outfile, "0x%02x", table[i]);
107 	fprintf(outfile, "%s", footer);
108 
109 }
110 
111 /**
112  * @brief Prints a table of uint16_t elements to a file.
113  * @param outfile: the file the table is printed to.
114  * @param table: the table to be printed.
115  * @param length: number of elements to be printed.
116  * @param header: header to append in front of the table.
117  * @param footer: footer to append at the end of the table.
118  * @param begin_line: string printed at beginning of new line
119  */
120 void fprint_uint16_table(FILE * outfile, uint16_t * table, uint64_t length, char *header,
121 			 char *footer, char *begin_line)
122 {
123 	int i;
124 	fprintf(outfile, "%s", header);
125 	for (i = 0; i < length - 1; i++) {
126 		if ((i & 7) == 0)
127 			fprintf(outfile, "\n%s", begin_line);
128 		else
129 			fprintf(outfile, " ");
130 		fprintf(outfile, "0x%04x,", table[i]);
131 	}
132 
133 	if ((i & 7) == 0)
134 		fprintf(outfile, "\n%s", begin_line);
135 	else
136 		fprintf(outfile, " ");
137 	fprintf(outfile, "0x%04x", table[i]);
138 	fprintf(outfile, "%s", footer);
139 
140 }
141 
142 /**
143  * @brief Prints a table of uint32_t elements to a file.
144  * @param outfile: the file the table is printed to.
145  * @param table: the table to be printed.
146  * @param length: number of elements to be printed.
147  * @param header: header to append in front of the table.
148  * @param footer: footer to append at the end of the table.
149  * @param begin_line: string printed at beginning of new line
150  */
151 void fprint_uint32_table(FILE * outfile, uint32_t * table, uint64_t length, char *header,
152 			 char *footer, char *begin_line)
153 {
154 	int i;
155 	fprintf(outfile, "%s", header);
156 	for (i = 0; i < length - 1; i++) {
157 		if ((i & 3) == 0)
158 			fprintf(outfile, "\n%s", begin_line);
159 		else
160 			fprintf(outfile, " ");
161 		fprintf(outfile, "0x%08x,", table[i]);
162 	}
163 
164 	if ((i & 3) == 0)
165 		fprintf(outfile, "%s", begin_line);
166 	else
167 		fprintf(outfile, " ");
168 	fprintf(outfile, "0x%08x", table[i]);
169 	fprintf(outfile, "%s", footer);
170 
171 }
172 
173 void fprint_hufftables(FILE * output_file, char *hufftables_name,
174 		       struct isal_hufftables *hufftables)
175 {
176 	fprintf(output_file, "struct isal_hufftables %s = {\n\n", hufftables_name);
177 
178 	fprint_uint8_table(output_file, hufftables->deflate_hdr,
179 			   hufftables->deflate_hdr_count +
180 			   (hufftables->deflate_hdr_extra_bits + 7) / 8,
181 			   "\t.deflate_hdr = {", "},\n\n", "\t\t");
182 
183 	fprintf(output_file, "\t.deflate_hdr_count = %d,\n", hufftables->deflate_hdr_count);
184 	fprintf(output_file, "\t.deflate_hdr_extra_bits = %d,\n\n",
185 		hufftables->deflate_hdr_extra_bits);
186 
187 	fprint_uint32_table(output_file, hufftables->dist_table, IGZIP_DIST_TABLE_SIZE,
188 			    "\t.dist_table = {", "},\n\n", "\t\t");
189 
190 	fprint_uint32_table(output_file, hufftables->len_table, IGZIP_LEN_TABLE_SIZE,
191 			    "\t.len_table = {", "},\n\n", "\t\t");
192 
193 	fprint_uint16_table(output_file, hufftables->lit_table, IGZIP_LIT_TABLE_SIZE,
194 			    "\t.lit_table = {", "},\n\n", "\t\t");
195 	fprint_uint8_table(output_file, hufftables->lit_table_sizes, IGZIP_LIT_TABLE_SIZE,
196 			   "\t.lit_table_sizes = {", "},\n\n", "\t\t");
197 
198 	fprint_uint16_table(output_file, hufftables->dcodes,
199 			    ISAL_DEF_DIST_SYMBOLS - IGZIP_DECODE_OFFSET,
200 			    "\t.dcodes = {", "},\n\n", "\t\t");
201 	fprint_uint8_table(output_file, hufftables->dcodes_sizes,
202 			   ISAL_DEF_DIST_SYMBOLS - IGZIP_DECODE_OFFSET,
203 			   "\t.dcodes_sizes = {", "}\n", "\t\t");
204 	fprintf(output_file, "};\n");
205 }
206 
207 void fprint_header(FILE * output_file)
208 {
209 
210 	fprintf(output_file, "#include <stdint.h>\n");
211 	fprintf(output_file, "#include <igzip_lib.h>\n\n");
212 
213 	fprintf(output_file, "#if IGZIP_HIST_SIZE > %d\n"
214 		"# error \"Invalid history size for the custom hufftable\"\n"
215 		"#endif\n", IGZIP_HIST_SIZE);
216 
217 #ifdef LONGER_HUFFTABLE
218 	fprintf(output_file, "#ifndef LONGER_HUFFTABLE\n"
219 		"# error \"Custom hufftable requires LONGER_HUFFTABLE to be defined \"\n"
220 		"#endif\n");
221 #else
222 	fprintf(output_file, "#ifdef LONGER_HUFFTABLE\n"
223 		"# error \"Custom hufftable requires LONGER_HUFFTABLE to not be defined \"\n"
224 		"#endif\n");
225 #endif
226 	fprintf(output_file, "\n");
227 
228 	fprintf(output_file, "const uint8_t gzip_hdr[] = {\n"
229 		"\t0x1f, 0x8b, 0x08, 0x00, 0x00,\n" "\t0x00, 0x00, 0x00, 0x00, 0xff\t};\n\n");
230 
231 	fprintf(output_file, "const uint32_t gzip_hdr_bytes = %d;\n", GZIP_HEADER_SIZE);
232 	fprintf(output_file, "const uint32_t gzip_trl_bytes = %d;\n\n", GZIP_TRAILER_SIZE);
233 
234 	fprintf(output_file, "const uint8_t zlib_hdr[] = { 0x78, 0x01 };\n\n");
235 	fprintf(output_file, "const uint32_t zlib_hdr_bytes = %d;\n", ZLIB_HEADER_SIZE);
236 	fprintf(output_file, "const uint32_t zlib_trl_bytes = %d;\n", ZLIB_TRAILER_SIZE);
237 }
238 
239 static uint32_t convert_dist_to_dist_sym(uint32_t dist)
240 {
241 	assert(dist <= 32768 && dist > 0);
242 	if (dist <= 32768) {
243 		uint32_t msb = dist > 4 ? bsr(dist - 1) - 2 : 0;
244 		return (msb * 2) + ((dist - 1) >> msb);
245 	} else {
246 		return ~0;
247 	}
248 }
249 
250 /**
251  * @brief  Returns the deflate symbol value for a repeat length.
252  */
253 static uint32_t convert_length_to_len_sym(uint32_t length)
254 {
255 	assert(length > 2 && length < 259);
256 
257 	/* Based on tables on page 11 in RFC 1951 */
258 	if (length < 11)
259 		return 257 + length - 3;
260 	else if (length < 19)
261 		return 261 + (length - 3) / 2;
262 	else if (length < 35)
263 		return 265 + (length - 3) / 4;
264 	else if (length < 67)
265 		return 269 + (length - 3) / 8;
266 	else if (length < 131)
267 		return 273 + (length - 3) / 16;
268 	else if (length < 258)
269 		return 277 + (length - 3) / 32;
270 	else
271 		return 285;
272 }
273 
274 void isal_update_histogram_dict(uint8_t * start_stream, int dict_length, int length,
275 				struct isal_huff_histogram *histogram)
276 {
277 	uint32_t literal = 0, hash;
278 	uint16_t seen, *last_seen = histogram->hash_table;
279 	uint8_t *current, *end_stream, *next_hash, *end, *end_dict;
280 	uint32_t match_length;
281 	uint32_t dist;
282 	uint64_t *lit_len_histogram = histogram->lit_len_histogram;
283 	uint64_t *dist_histogram = histogram->dist_histogram;
284 
285 	if (length <= 0)
286 		return;
287 
288 	end_stream = start_stream + dict_length + length;
289 	end_dict = start_stream + dict_length;
290 
291 	memset(last_seen, 0, sizeof(histogram->hash_table));	/* Initialize last_seen to be 0. */
292 
293 	for (current = start_stream; current < end_dict - 4; current++) {
294 		literal = load_le_u32(current);
295 		hash = compute_hash(literal) & LVL0_HASH_MASK;
296 		last_seen[hash] = (current - start_stream) & 0xFFFF;
297 	}
298 
299 	for (current = start_stream + dict_length; current < end_stream - 3; current++) {
300 		literal = load_le_u32(current);
301 		hash = compute_hash(literal) & LVL0_HASH_MASK;
302 		seen = last_seen[hash];
303 		last_seen[hash] = (current - start_stream) & 0xFFFF;
304 		dist = (current - start_stream - seen) & 0xFFFF;
305 		if (dist - 1 < D - 1) {
306 			assert(start_stream <= current - dist);
307 			match_length =
308 			    compare258(current - dist, current, end_stream - current);
309 			if (match_length >= SHORTEST_MATCH) {
310 				next_hash = current;
311 #ifdef ISAL_LIMIT_HASH_UPDATE
312 				end = next_hash + 3;
313 #else
314 				end = next_hash + match_length;
315 #endif
316 				if (end > end_stream - 3)
317 					end = end_stream - 3;
318 				next_hash++;
319 				for (; next_hash < end; next_hash++) {
320 					literal = load_le_u32(next_hash);
321 					hash = compute_hash(literal) & LVL0_HASH_MASK;
322 					last_seen[hash] = (next_hash - start_stream) & 0xFFFF;
323 				}
324 
325 				dist_histogram[convert_dist_to_dist_sym(dist)] += 1;
326 				lit_len_histogram[convert_length_to_len_sym(match_length)] +=
327 				    1;
328 				current += match_length - 1;
329 				continue;
330 			}
331 		}
332 		lit_len_histogram[literal & 0xFF] += 1;
333 	}
334 
335 	for (; current < end_stream; current++)
336 		lit_len_histogram[*current] += 1;
337 
338 	lit_len_histogram[256] += 1;
339 	return;
340 }
341 
342 int main(int argc, char *argv[])
343 {
344 	long int file_length;
345 	int argi = 1;
346 	uint8_t *stream = NULL;
347 	struct isal_hufftables hufftables;
348 	struct isal_huff_histogram histogram;
349 	struct isal_zstream tmp_stream;
350 	FILE *file = NULL;
351 	FILE *dict_file = NULL;
352 	FILE *hist_file = NULL;
353 	long int dict_file_length = 0;
354 	long int hist_file_length = 0;
355 	uint8_t *dict_stream = NULL;
356 
357 	if (argc == 1) {
358 		printf("Error, no input file.\n");
359 		return 1;
360 	}
361 
362 	if (argc > 3 && argv[1][0] == '-' && argv[1][1] == 'd') {
363 		dict_file = fopen(argv[2], "r");
364 		if (dict_file == NULL) {
365 			printf("File \"%s\" open error!\n", argv[2]);
366 			return 1;
367 		}
368 
369 		fseek(dict_file, 0, SEEK_END);
370 		dict_file_length = ftell(dict_file);
371 		fseek(dict_file, 0, SEEK_SET);
372 		dict_file_length -= ftell(dict_file);
373 		dict_stream = malloc(dict_file_length);
374 		if (dict_stream == NULL) {
375 			printf("Failed to allocate memory to read in dictionary file\n");
376 			fclose(dict_file);
377 			return 1;
378 		}
379 		if (fread(dict_stream, 1, dict_file_length, dict_file) != dict_file_length) {
380 			printf("Error occurred when reading dictionary file");
381 			fclose(dict_file);
382 			free(dict_stream);
383 			return 1;
384 		}
385 		isal_update_histogram(dict_stream, dict_file_length, &histogram);
386 
387 		printf("Read %ld bytes of dictionary file %s\n", dict_file_length, argv[2]);
388 		argi += 2;
389 		fclose(dict_file);
390 		free(dict_stream);
391 	}
392 
393 	if ((argc > argi + 1) && argv[argi][0] == '-' && argv[argi][1] == 'h') {
394 		hist_file = fopen(argv[argi + 1], "r+");
395 		if (hist_file == NULL) {
396 			printf("File \"%s\" open error!\n", argv[argi + 1]);
397 			return 1;
398 		}
399 		fseek(hist_file, 0, SEEK_END);
400 		hist_file_length = ftell(hist_file);
401 		fseek(hist_file, 0, SEEK_SET);
402 		hist_file_length -= ftell(hist_file);
403 		if (hist_file_length > sizeof(histogram)) {
404 			printf("Histogram file too long\n");
405 			return 1;
406 		}
407 		if (fread(&histogram, 1, hist_file_length, hist_file) != hist_file_length) {
408 			printf("Error occurred when reading history file");
409 			fclose(hist_file);
410 			return 1;
411 		}
412 		fseek(hist_file, 0, SEEK_SET);
413 
414 		printf("Read %ld bytes of history file %s\n", hist_file_length,
415 		       argv[argi + 1]);
416 		argi += 2;
417 	} else
418 		memset(&histogram, 0, sizeof(histogram));	/* Initialize histograms. */
419 
420 	while (argi < argc) {
421 		printf("Processing %s\n", argv[argi]);
422 		file = fopen(argv[argi], "r");
423 		if (file == NULL) {
424 			printf("Error opening file\n");
425 			return 1;
426 		}
427 		fseek(file, 0, SEEK_END);
428 		file_length = ftell(file);
429 		fseek(file, 0, SEEK_SET);
430 		file_length -= ftell(file);
431 		stream = malloc(file_length + dict_file_length);
432 		if (stream == NULL) {
433 			printf("Failed to allocate memory to read in file\n");
434 			fclose(file);
435 			return 1;
436 		}
437 		if (dict_file_length > 0)
438 			memcpy(stream, dict_stream, dict_file_length);
439 
440 		if (fread(&stream[dict_file_length], 1, file_length, file) != file_length) {
441 			printf("Error occurred when reading file");
442 			fclose(file);
443 			free(stream);
444 			return 1;
445 		}
446 
447 		/* Create a histogram of frequency of symbols found in stream to
448 		 * generate the huffman tree.*/
449 		if (0 == dict_file_length)
450 			isal_update_histogram(stream, file_length, &histogram);
451 		else
452 			isal_update_histogram_dict(stream, dict_file_length, file_length,
453 						   &histogram);
454 
455 		fclose(file);
456 		free(stream);
457 		argi++;
458 	}
459 
460 	isal_create_hufftables(&hufftables, &histogram);
461 
462 	file = fopen("hufftables_c.c", "w");
463 	if (file == NULL) {
464 		printf("Error creating file hufftables_c.c\n");
465 		return 1;
466 	}
467 
468 	fprint_header(file);
469 
470 	fprintf(file, "\n");
471 
472 	fprint_hufftables(file, "hufftables_default", &hufftables);
473 
474 	fprintf(file, "\n");
475 
476 	isal_deflate_stateless_init(&tmp_stream);
477 	isal_deflate_set_hufftables(&tmp_stream, NULL, IGZIP_HUFFTABLE_STATIC);
478 	fprint_hufftables(file, "hufftables_static", tmp_stream.hufftables);
479 
480 	fclose(file);
481 
482 	if (hist_file) {
483 		int len = fwrite(&histogram, 1, sizeof(histogram), hist_file);
484 		printf("wrote %d bytes of histogram file\n", len);
485 		fclose(hist_file);
486 	}
487 	return 0;
488 }
489