xref: /isa-l/igzip/generate_custom_hufftables.c (revision 55fbfabfc60f1002bc8133b730a59f6abd22cfce)
1 /**********************************************************************
2   Copyright(c) 2011-2016 Intel Corporation All rights reserved.
3 
4   Redistribution and use in source and binary forms, with or without
5   modification, are permitted provided that the following conditions
6   are met:
7     * Redistributions of source code must retain the above copyright
8       notice, this list of conditions and the following disclaimer.
9     * Redistributions in binary form must reproduce the above copyright
10       notice, this list of conditions and the following disclaimer in
11       the documentation and/or other materials provided with the
12       distribution.
13     * Neither the name of Intel Corporation nor the names of its
14       contributors may be used to endorse or promote products derived
15       from this software without specific prior written permission.
16 
17   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 **********************************************************************/
29 
30 /* This program can be used to generate custom a custom huffman encoding to get
31  * better data compression. This is most useful when the type of data being
32  * compressed is well known.
33  *
34  * To use generate_custom_hufftables, pass a sequence of files to the program
35  * that together form an accurate representation of the data that is being
36  * compressed. Generate_custom_hufftables will then produce the file
37  * hufftables_c.c, which should be moved to replace its counterpart in the igzip
38  * source folder. After recompiling the Isa-l library, the igzip compression
39  * functions will use the new hufftables.
40  *
41  * Generate_custom_hufftables should be compiled with the same compile time
42  * parameters as the igzip source code. Generating custom hufftables with
43  * different compile time parameters may cause igzip to produce invalid output
44  * for the reasons described below. The default parameters used by
45  * generate_custom_hufftables are the same as the default parameters used by
46  * igzip.
47  *
48  * *WARNING* generate custom hufftables must be compiled with a IGZIP_HIST_SIZE
49  * that is at least as large as the IGZIP_HIST_SIZE used by igzip. By default
50  * IGZIP_HIST_SIZE is 32K, the maximum usable IGZIP_HIST_SIZE is 32K. The reason
51  * for this is to generate better compression. Igzip cannot produce look back
52  * distances with sizes larger than the IGZIP_HIST_SIZE igzip was compiled with,
53  * so look back distances with sizes larger than IGZIP_HIST_SIZE are not
54  * assigned a huffman code. The definition of LONGER_HUFFTABLES must be
55  * consistent as well since that definition changes the size of the structures
56  * printed by this tool.
57  *
58  */
59 
60 #include <stdint.h>
61 #include <stdio.h>
62 #include <inttypes.h>
63 #include <string.h>
64 #include <stdlib.h>
65 #include "igzip_lib.h"
66 
67 #include "huff_codes.h"
68 #include "huffman.h"
69 
70 /*These max code lengths are limited by how the data is stored in
71  * hufftables.asm. The deflate standard max is 15.*/
72 
73 #define MAX_HEADER_SIZE ISAL_DEF_MAX_HDR_SIZE
74 
75 #define GZIP_HEADER_SIZE  10
76 #define GZIP_TRAILER_SIZE 8
77 #define ZLIB_HEADER_SIZE  2
78 #define ZLIB_TRAILER_SIZE 4
79 
80 /**
81  * @brief Prints a table of uint8_t elements to a file.
82  * @param outfile: the file the table is printed to.
83  * @param table: the table to be printed.
84  * @param length: number of elements to be printed.
85  * @param header: header to append in front of the table.
86  * @param footer: footer to append at the end of the table.
87  * @param begin_line: string printed at beginning of new line
88  */
89 void
fprint_uint8_table(FILE * outfile,uint8_t * table,uint64_t length,char * header,char * footer,char * begin_line)90 fprint_uint8_table(FILE *outfile, uint8_t *table, uint64_t length, char *header, char *footer,
91                    char *begin_line)
92 {
93         int i;
94         fprintf(outfile, "%s", header);
95         for (i = 0; i < length - 1; i++) {
96                 if ((i & 7) == 0)
97                         fprintf(outfile, "\n%s", begin_line);
98                 else
99                         fprintf(outfile, " ");
100                 fprintf(outfile, "0x%02x,", table[i]);
101         }
102 
103         if ((i & 7) == 0)
104                 fprintf(outfile, "\n%s", begin_line);
105         else
106                 fprintf(outfile, " ");
107         fprintf(outfile, "0x%02x", table[i]);
108         fprintf(outfile, "%s", footer);
109 }
110 
111 /**
112  * @brief Prints a table of uint16_t elements to a file.
113  * @param outfile: the file the table is printed to.
114  * @param table: the table to be printed.
115  * @param length: number of elements to be printed.
116  * @param header: header to append in front of the table.
117  * @param footer: footer to append at the end of the table.
118  * @param begin_line: string printed at beginning of new line
119  */
120 void
fprint_uint16_table(FILE * outfile,uint16_t * table,uint64_t length,char * header,char * footer,char * begin_line)121 fprint_uint16_table(FILE *outfile, uint16_t *table, uint64_t length, char *header, char *footer,
122                     char *begin_line)
123 {
124         int i;
125         fprintf(outfile, "%s", header);
126         for (i = 0; i < length - 1; i++) {
127                 if ((i & 7) == 0)
128                         fprintf(outfile, "\n%s", begin_line);
129                 else
130                         fprintf(outfile, " ");
131                 fprintf(outfile, "0x%04x,", table[i]);
132         }
133 
134         if ((i & 7) == 0)
135                 fprintf(outfile, "\n%s", begin_line);
136         else
137                 fprintf(outfile, " ");
138         fprintf(outfile, "0x%04x", table[i]);
139         fprintf(outfile, "%s", footer);
140 }
141 
142 /**
143  * @brief Prints a table of uint32_t elements to a file.
144  * @param outfile: the file the table is printed to.
145  * @param table: the table to be printed.
146  * @param length: number of elements to be printed.
147  * @param header: header to append in front of the table.
148  * @param footer: footer to append at the end of the table.
149  * @param begin_line: string printed at beginning of new line
150  */
151 void
fprint_uint32_table(FILE * outfile,uint32_t * table,uint64_t length,char * header,char * footer,char * begin_line)152 fprint_uint32_table(FILE *outfile, uint32_t *table, uint64_t length, char *header, char *footer,
153                     char *begin_line)
154 {
155         int i;
156         fprintf(outfile, "%s", header);
157         for (i = 0; i < length - 1; i++) {
158                 if ((i & 3) == 0)
159                         fprintf(outfile, "\n%s", begin_line);
160                 else
161                         fprintf(outfile, " ");
162                 fprintf(outfile, "0x%08x,", table[i]);
163         }
164 
165         if ((i & 3) == 0)
166                 fprintf(outfile, "%s", begin_line);
167         else
168                 fprintf(outfile, " ");
169         fprintf(outfile, "0x%08x", table[i]);
170         fprintf(outfile, "%s", footer);
171 }
172 
173 void
fprint_hufftables(FILE * output_file,char * hufftables_name,struct isal_hufftables * hufftables)174 fprint_hufftables(FILE *output_file, char *hufftables_name, struct isal_hufftables *hufftables)
175 {
176         fprintf(output_file, "struct isal_hufftables %s = {\n\n", hufftables_name);
177 
178         fprint_uint8_table(output_file, hufftables->deflate_hdr,
179                            hufftables->deflate_hdr_count +
180                                    (hufftables->deflate_hdr_extra_bits + 7) / 8,
181                            "\t.deflate_hdr = {", "},\n\n", "\t\t");
182 
183         fprintf(output_file, "\t.deflate_hdr_count = %d,\n", hufftables->deflate_hdr_count);
184         fprintf(output_file, "\t.deflate_hdr_extra_bits = %d,\n\n",
185                 hufftables->deflate_hdr_extra_bits);
186 
187         fprint_uint32_table(output_file, hufftables->dist_table, IGZIP_DIST_TABLE_SIZE,
188                             "\t.dist_table = {", "},\n\n", "\t\t");
189 
190         fprint_uint32_table(output_file, hufftables->len_table, IGZIP_LEN_TABLE_SIZE,
191                             "\t.len_table = {", "},\n\n", "\t\t");
192 
193         fprint_uint16_table(output_file, hufftables->lit_table, IGZIP_LIT_TABLE_SIZE,
194                             "\t.lit_table = {", "},\n\n", "\t\t");
195         fprint_uint8_table(output_file, hufftables->lit_table_sizes, IGZIP_LIT_TABLE_SIZE,
196                            "\t.lit_table_sizes = {", "},\n\n", "\t\t");
197 
198         fprint_uint16_table(output_file, hufftables->dcodes,
199                             ISAL_DEF_DIST_SYMBOLS - IGZIP_DECODE_OFFSET, "\t.dcodes = {", "},\n\n",
200                             "\t\t");
201         fprint_uint8_table(output_file, hufftables->dcodes_sizes,
202                            ISAL_DEF_DIST_SYMBOLS - IGZIP_DECODE_OFFSET, "\t.dcodes_sizes = {",
203                            "}\n", "\t\t");
204         fprintf(output_file, "};\n");
205 }
206 
207 void
fprint_header(FILE * output_file)208 fprint_header(FILE *output_file)
209 {
210 
211         fprintf(output_file, "#include <stdint.h>\n");
212         fprintf(output_file, "#include <igzip_lib.h>\n\n");
213 
214         fprintf(output_file,
215                 "#if IGZIP_HIST_SIZE > %d\n"
216                 "# error \"Invalid history size for the custom hufftable\"\n"
217                 "#endif\n",
218                 IGZIP_HIST_SIZE);
219 
220 #ifdef LONGER_HUFFTABLE
221         fprintf(output_file,
222                 "#ifndef LONGER_HUFFTABLE\n"
223                 "# error \"Custom hufftable requires LONGER_HUFFTABLE to be defined \"\n"
224                 "#endif\n");
225 #else
226         fprintf(output_file,
227                 "#ifdef LONGER_HUFFTABLE\n"
228                 "# error \"Custom hufftable requires LONGER_HUFFTABLE to not be defined \"\n"
229                 "#endif\n");
230 #endif
231         fprintf(output_file, "\n");
232 
233         fprintf(output_file, "const uint8_t gzip_hdr[] = {\n"
234                              "\t0x1f, 0x8b, 0x08, 0x00, 0x00,\n"
235                              "\t0x00, 0x00, 0x00, 0x00, 0xff\t};\n\n");
236 
237         fprintf(output_file, "const uint32_t gzip_hdr_bytes = %d;\n", GZIP_HEADER_SIZE);
238         fprintf(output_file, "const uint32_t gzip_trl_bytes = %d;\n\n", GZIP_TRAILER_SIZE);
239 
240         fprintf(output_file, "const uint8_t zlib_hdr[] = { 0x78, 0x01 };\n\n");
241         fprintf(output_file, "const uint32_t zlib_hdr_bytes = %d;\n", ZLIB_HEADER_SIZE);
242         fprintf(output_file, "const uint32_t zlib_trl_bytes = %d;\n", ZLIB_TRAILER_SIZE);
243 }
244 
245 static uint32_t
convert_dist_to_dist_sym(uint32_t dist)246 convert_dist_to_dist_sym(uint32_t dist)
247 {
248         assert(dist <= 32768 && dist > 0);
249         if (dist <= 32768) {
250                 uint32_t msb = dist > 4 ? bsr(dist - 1) - 2 : 0;
251                 return (msb * 2) + ((dist - 1) >> msb);
252         } else {
253                 return ~0;
254         }
255 }
256 
257 /**
258  * @brief  Returns the deflate symbol value for a repeat length.
259  */
260 static uint32_t
convert_length_to_len_sym(uint32_t length)261 convert_length_to_len_sym(uint32_t length)
262 {
263         assert(length > 2 && length < 259);
264 
265         /* Based on tables on page 11 in RFC 1951 */
266         if (length < 11)
267                 return 257 + length - 3;
268         else if (length < 19)
269                 return 261 + (length - 3) / 2;
270         else if (length < 35)
271                 return 265 + (length - 3) / 4;
272         else if (length < 67)
273                 return 269 + (length - 3) / 8;
274         else if (length < 131)
275                 return 273 + (length - 3) / 16;
276         else if (length < 258)
277                 return 277 + (length - 3) / 32;
278         else
279                 return 285;
280 }
281 
282 void
isal_update_histogram_dict(uint8_t * start_stream,int dict_length,int length,struct isal_huff_histogram * histogram)283 isal_update_histogram_dict(uint8_t *start_stream, int dict_length, int length,
284                            struct isal_huff_histogram *histogram)
285 {
286         uint32_t literal = 0, hash;
287         uint16_t seen, *last_seen = histogram->hash_table;
288         uint8_t *current, *end_stream, *next_hash, *end, *end_dict;
289         uint32_t match_length;
290         uint32_t dist;
291         uint64_t *lit_len_histogram = histogram->lit_len_histogram;
292         uint64_t *dist_histogram = histogram->dist_histogram;
293 
294         if (length <= 0)
295                 return;
296 
297         end_stream = start_stream + dict_length + length;
298         end_dict = start_stream + dict_length;
299 
300         memset(last_seen, 0, sizeof(histogram->hash_table)); /* Initialize last_seen to be 0. */
301 
302         for (current = start_stream; current < end_dict - 4; current++) {
303                 literal = load_le_u32(current);
304                 hash = compute_hash(literal) & LVL0_HASH_MASK;
305                 last_seen[hash] = (current - start_stream) & 0xFFFF;
306         }
307 
308         for (current = start_stream + dict_length; current < end_stream - 3; current++) {
309                 literal = load_le_u32(current);
310                 hash = compute_hash(literal) & LVL0_HASH_MASK;
311                 seen = last_seen[hash];
312                 last_seen[hash] = (current - start_stream) & 0xFFFF;
313                 dist = (current - start_stream - seen) & 0xFFFF;
314                 if (dist - 1 < D - 1) {
315                         assert(start_stream <= current - dist);
316                         match_length = compare258(current - dist, current, end_stream - current);
317                         if (match_length >= SHORTEST_MATCH) {
318                                 next_hash = current;
319 #ifdef ISAL_LIMIT_HASH_UPDATE
320                                 end = next_hash + 3;
321 #else
322                                 end = next_hash + match_length;
323 #endif
324                                 if (end > end_stream - 3)
325                                         end = end_stream - 3;
326                                 next_hash++;
327                                 for (; next_hash < end; next_hash++) {
328                                         literal = load_le_u32(next_hash);
329                                         hash = compute_hash(literal) & LVL0_HASH_MASK;
330                                         last_seen[hash] = (next_hash - start_stream) & 0xFFFF;
331                                 }
332 
333                                 dist_histogram[convert_dist_to_dist_sym(dist)] += 1;
334                                 lit_len_histogram[convert_length_to_len_sym(match_length)] += 1;
335                                 current += match_length - 1;
336                                 continue;
337                         }
338                 }
339                 lit_len_histogram[literal & 0xFF] += 1;
340         }
341 
342         for (; current < end_stream; current++)
343                 lit_len_histogram[*current] += 1;
344 
345         lit_len_histogram[256] += 1;
346         return;
347 }
348 
349 int
main(int argc,char * argv[])350 main(int argc, char *argv[])
351 {
352         long int file_length;
353         int argi = 1;
354         uint8_t *stream = NULL;
355         struct isal_hufftables hufftables;
356         struct isal_huff_histogram histogram;
357         struct isal_zstream tmp_stream;
358         FILE *file = NULL;
359         FILE *dict_file = NULL;
360         FILE *hist_file = NULL;
361         long int dict_file_length = 0;
362         long int hist_file_length = 0;
363         uint8_t *dict_stream = NULL;
364 
365         if (argc == 1) {
366                 printf("Error, no input file.\n");
367                 return 1;
368         }
369 
370         if (argc > 3 && argv[1][0] == '-' && argv[1][1] == 'd') {
371                 dict_file = fopen(argv[2], "r");
372                 if (dict_file == NULL) {
373                         printf("File \"%s\" open error!\n", argv[2]);
374                         return 1;
375                 }
376 
377                 fseek(dict_file, 0, SEEK_END);
378                 dict_file_length = ftell(dict_file);
379                 fseek(dict_file, 0, SEEK_SET);
380                 dict_file_length -= ftell(dict_file);
381                 dict_stream = malloc(dict_file_length);
382                 if (dict_stream == NULL) {
383                         printf("Failed to allocate memory to read in dictionary file\n");
384                         fclose(dict_file);
385                         return 1;
386                 }
387                 if (fread(dict_stream, 1, dict_file_length, dict_file) != dict_file_length) {
388                         printf("Error occurred when reading dictionary file");
389                         fclose(dict_file);
390                         free(dict_stream);
391                         return 1;
392                 }
393                 isal_update_histogram(dict_stream, dict_file_length, &histogram);
394 
395                 printf("Read %ld bytes of dictionary file %s\n", dict_file_length, argv[2]);
396                 argi += 2;
397                 fclose(dict_file);
398                 free(dict_stream);
399         }
400 
401         if ((argc > argi + 1) && argv[argi][0] == '-' && argv[argi][1] == 'h') {
402                 hist_file = fopen(argv[argi + 1], "r+");
403                 if (hist_file == NULL) {
404                         printf("File \"%s\" open error!\n", argv[argi + 1]);
405                         return 1;
406                 }
407                 fseek(hist_file, 0, SEEK_END);
408                 hist_file_length = ftell(hist_file);
409                 fseek(hist_file, 0, SEEK_SET);
410                 hist_file_length -= ftell(hist_file);
411                 if (hist_file_length > sizeof(histogram)) {
412                         printf("Histogram file too long\n");
413                         return 1;
414                 }
415                 if (fread(&histogram, 1, hist_file_length, hist_file) != hist_file_length) {
416                         printf("Error occurred when reading history file");
417                         fclose(hist_file);
418                         return 1;
419                 }
420                 fseek(hist_file, 0, SEEK_SET);
421 
422                 printf("Read %ld bytes of history file %s\n", hist_file_length, argv[argi + 1]);
423                 argi += 2;
424         } else
425                 memset(&histogram, 0, sizeof(histogram)); /* Initialize histograms. */
426 
427         while (argi < argc) {
428                 printf("Processing %s\n", argv[argi]);
429                 file = fopen(argv[argi], "r");
430                 if (file == NULL) {
431                         printf("Error opening file\n");
432                         return 1;
433                 }
434                 fseek(file, 0, SEEK_END);
435                 file_length = ftell(file);
436                 fseek(file, 0, SEEK_SET);
437                 file_length -= ftell(file);
438                 stream = malloc(file_length + dict_file_length);
439                 if (stream == NULL) {
440                         printf("Failed to allocate memory to read in file\n");
441                         fclose(file);
442                         return 1;
443                 }
444                 if (dict_file_length > 0)
445                         memcpy(stream, dict_stream, dict_file_length);
446 
447                 if (fread(&stream[dict_file_length], 1, file_length, file) != file_length) {
448                         printf("Error occurred when reading file");
449                         fclose(file);
450                         free(stream);
451                         return 1;
452                 }
453 
454                 /* Create a histogram of frequency of symbols found in stream to
455                  * generate the huffman tree.*/
456                 if (0 == dict_file_length)
457                         isal_update_histogram(stream, file_length, &histogram);
458                 else
459                         isal_update_histogram_dict(stream, dict_file_length, file_length,
460                                                    &histogram);
461 
462                 fclose(file);
463                 free(stream);
464                 argi++;
465         }
466 
467         isal_create_hufftables(&hufftables, &histogram);
468 
469         file = fopen("hufftables_c.c", "w");
470         if (file == NULL) {
471                 printf("Error creating file hufftables_c.c\n");
472                 return 1;
473         }
474 
475         fprint_header(file);
476 
477         fprintf(file, "\n");
478 
479         fprint_hufftables(file, "hufftables_default", &hufftables);
480 
481         fprintf(file, "\n");
482 
483         isal_deflate_stateless_init(&tmp_stream);
484         isal_deflate_set_hufftables(&tmp_stream, NULL, IGZIP_HUFFTABLE_STATIC);
485         fprint_hufftables(file, "hufftables_static", tmp_stream.hufftables);
486 
487         fclose(file);
488 
489         if (hist_file) {
490                 int len = fwrite(&histogram, 1, sizeof(histogram), hist_file);
491                 printf("wrote %d bytes of histogram file\n", len);
492                 fclose(hist_file);
493         }
494         return 0;
495 }
496