1 /********************************************************************** 2 Copyright(c) 2011-2016 Intel Corporation All rights reserved. 3 4 Redistribution and use in source and binary forms, with or without 5 modification, are permitted provided that the following conditions 6 are met: 7 * Redistributions of source code must retain the above copyright 8 notice, this list of conditions and the following disclaimer. 9 * Redistributions in binary form must reproduce the above copyright 10 notice, this list of conditions and the following disclaimer in 11 the documentation and/or other materials provided with the 12 distribution. 13 * Neither the name of Intel Corporation nor the names of its 14 contributors may be used to endorse or promote products derived 15 from this software without specific prior written permission. 16 17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 **********************************************************************/ 29 30 /* This program can be used to generate custom a custom huffman encoding to get 31 * better data compression. This is most useful when the type of data being 32 * compressed is well known. 33 * 34 * To use generate_custom_hufftables, pass a sequence of files to the program 35 * that together form an accurate representation of the data that is being 36 * compressed. Generate_custom_hufftables will then produce the file 37 * hufftables_c.c, which should be moved to replace its counterpart in the igzip 38 * source folder. After recompiling the Isa-l library, the igzip compression 39 * functions will use the new hufftables. 40 * 41 * Generate_custom_hufftables should be compiled with the same compile time 42 * parameters as the igzip source code. Generating custom hufftables with 43 * different compile time parameters may cause igzip to produce invalid output 44 * for the reasons described below. The default parameters used by 45 * generate_custom_hufftables are the same as the default parameters used by 46 * igzip. 47 * 48 * *WARNING* generate custom hufftables must be compiled with a IGZIP_HIST_SIZE 49 * that is at least as large as the IGZIP_HIST_SIZE used by igzip. By default 50 * IGZIP_HIST_SIZE is 32K, the maximum usable IGZIP_HIST_SIZE is 32K. The reason 51 * for this is to generate better compression. Igzip cannot produce look back 52 * distances with sizes larger than the IGZIP_HIST_SIZE igzip was compiled with, 53 * so look back distances with sizes larger than IGZIP_HIST_SIZE are not 54 * assigned a huffman code. The definition of LONGER_HUFFTABLES must be 55 * consistent as well since that definition changes the size of the structures 56 * printed by this tool. 57 * 58 */ 59 60 #include <stdint.h> 61 #include <stdio.h> 62 #include <inttypes.h> 63 #include <string.h> 64 #include <stdlib.h> 65 #include "igzip_lib.h" 66 67 #include "huff_codes.h" 68 #include "huffman.h" 69 70 /*These max code lengths are limited by how the data is stored in 71 * hufftables.asm. The deflate standard max is 15.*/ 72 73 #define MAX_HEADER_SIZE ISAL_DEF_MAX_HDR_SIZE 74 75 #define GZIP_HEADER_SIZE 10 76 #define GZIP_TRAILER_SIZE 8 77 #define ZLIB_HEADER_SIZE 2 78 #define ZLIB_TRAILER_SIZE 4 79 80 /** 81 * @brief Prints a table of uint8_t elements to a file. 82 * @param outfile: the file the table is printed to. 83 * @param table: the table to be printed. 84 * @param length: number of elements to be printed. 85 * @param header: header to append in front of the table. 86 * @param footer: footer to append at the end of the table. 87 * @param begin_line: string printed at beginning of new line 88 */ 89 void fprint_uint8_table(FILE * outfile, uint8_t * table, uint64_t length, char *header, 90 char *footer, char *begin_line) 91 { 92 int i; 93 fprintf(outfile, "%s", header); 94 for (i = 0; i < length - 1; i++) { 95 if ((i & 7) == 0) 96 fprintf(outfile, "\n%s", begin_line); 97 else 98 fprintf(outfile, " "); 99 fprintf(outfile, "0x%02x,", table[i]); 100 } 101 102 if ((i & 7) == 0) 103 fprintf(outfile, "\n%s", begin_line); 104 else 105 fprintf(outfile, " "); 106 fprintf(outfile, "0x%02x", table[i]); 107 fprintf(outfile, "%s", footer); 108 109 } 110 111 /** 112 * @brief Prints a table of uint16_t elements to a file. 113 * @param outfile: the file the table is printed to. 114 * @param table: the table to be printed. 115 * @param length: number of elements to be printed. 116 * @param header: header to append in front of the table. 117 * @param footer: footer to append at the end of the table. 118 * @param begin_line: string printed at beginning of new line 119 */ 120 void fprint_uint16_table(FILE * outfile, uint16_t * table, uint64_t length, char *header, 121 char *footer, char *begin_line) 122 { 123 int i; 124 fprintf(outfile, "%s", header); 125 for (i = 0; i < length - 1; i++) { 126 if ((i & 7) == 0) 127 fprintf(outfile, "\n%s", begin_line); 128 else 129 fprintf(outfile, " "); 130 fprintf(outfile, "0x%04x,", table[i]); 131 } 132 133 if ((i & 7) == 0) 134 fprintf(outfile, "\n%s", begin_line); 135 else 136 fprintf(outfile, " "); 137 fprintf(outfile, "0x%04x", table[i]); 138 fprintf(outfile, "%s", footer); 139 140 } 141 142 /** 143 * @brief Prints a table of uint32_t elements to a file. 144 * @param outfile: the file the table is printed to. 145 * @param table: the table to be printed. 146 * @param length: number of elements to be printed. 147 * @param header: header to append in front of the table. 148 * @param footer: footer to append at the end of the table. 149 * @param begin_line: string printed at beginning of new line 150 */ 151 void fprint_uint32_table(FILE * outfile, uint32_t * table, uint64_t length, char *header, 152 char *footer, char *begin_line) 153 { 154 int i; 155 fprintf(outfile, "%s", header); 156 for (i = 0; i < length - 1; i++) { 157 if ((i & 3) == 0) 158 fprintf(outfile, "\n%s", begin_line); 159 else 160 fprintf(outfile, " "); 161 fprintf(outfile, "0x%08x,", table[i]); 162 } 163 164 if ((i & 3) == 0) 165 fprintf(outfile, "%s", begin_line); 166 else 167 fprintf(outfile, " "); 168 fprintf(outfile, "0x%08x", table[i]); 169 fprintf(outfile, "%s", footer); 170 171 } 172 173 void fprint_hufftables(FILE * output_file, char *hufftables_name, 174 struct isal_hufftables *hufftables) 175 { 176 fprintf(output_file, "struct isal_hufftables %s = {\n\n", hufftables_name); 177 178 fprint_uint8_table(output_file, hufftables->deflate_hdr, 179 hufftables->deflate_hdr_count + 180 (hufftables->deflate_hdr_extra_bits + 7) / 8, 181 "\t.deflate_hdr = {", "},\n\n", "\t\t"); 182 183 fprintf(output_file, "\t.deflate_hdr_count = %d,\n", hufftables->deflate_hdr_count); 184 fprintf(output_file, "\t.deflate_hdr_extra_bits = %d,\n\n", 185 hufftables->deflate_hdr_extra_bits); 186 187 fprint_uint32_table(output_file, hufftables->dist_table, IGZIP_DIST_TABLE_SIZE, 188 "\t.dist_table = {", "},\n\n", "\t\t"); 189 190 fprint_uint32_table(output_file, hufftables->len_table, IGZIP_LEN_TABLE_SIZE, 191 "\t.len_table = {", "},\n\n", "\t\t"); 192 193 fprint_uint16_table(output_file, hufftables->lit_table, IGZIP_LIT_TABLE_SIZE, 194 "\t.lit_table = {", "},\n\n", "\t\t"); 195 fprint_uint8_table(output_file, hufftables->lit_table_sizes, IGZIP_LIT_TABLE_SIZE, 196 "\t.lit_table_sizes = {", "},\n\n", "\t\t"); 197 198 fprint_uint16_table(output_file, hufftables->dcodes, 199 ISAL_DEF_DIST_SYMBOLS - IGZIP_DECODE_OFFSET, 200 "\t.dcodes = {", "},\n\n", "\t\t"); 201 fprint_uint8_table(output_file, hufftables->dcodes_sizes, 202 ISAL_DEF_DIST_SYMBOLS - IGZIP_DECODE_OFFSET, 203 "\t.dcodes_sizes = {", "}\n", "\t\t"); 204 fprintf(output_file, "};\n"); 205 } 206 207 void fprint_header(FILE * output_file) 208 { 209 210 fprintf(output_file, "#include <stdint.h>\n"); 211 fprintf(output_file, "#include <igzip_lib.h>\n\n"); 212 213 fprintf(output_file, "#if IGZIP_HIST_SIZE > %d\n" 214 "# error \"Invalid history size for the custom hufftable\"\n" 215 "#endif\n", IGZIP_HIST_SIZE); 216 217 #ifdef LONGER_HUFFTABLE 218 fprintf(output_file, "#ifndef LONGER_HUFFTABLE\n" 219 "# error \"Custom hufftable requires LONGER_HUFFTABLE to be defined \"\n" 220 "#endif\n"); 221 #else 222 fprintf(output_file, "#ifdef LONGER_HUFFTABLE\n" 223 "# error \"Custom hufftable requires LONGER_HUFFTABLE to not be defined \"\n" 224 "#endif\n"); 225 #endif 226 fprintf(output_file, "\n"); 227 228 fprintf(output_file, "const uint8_t gzip_hdr[] = {\n" 229 "\t0x1f, 0x8b, 0x08, 0x00, 0x00,\n" "\t0x00, 0x00, 0x00, 0x00, 0xff\t};\n\n"); 230 231 fprintf(output_file, "const uint32_t gzip_hdr_bytes = %d;\n", GZIP_HEADER_SIZE); 232 fprintf(output_file, "const uint32_t gzip_trl_bytes = %d;\n\n", GZIP_TRAILER_SIZE); 233 234 fprintf(output_file, "const uint8_t zlib_hdr[] = { 0x78, 0x01 };\n\n"); 235 fprintf(output_file, "const uint32_t zlib_hdr_bytes = %d;\n", ZLIB_HEADER_SIZE); 236 fprintf(output_file, "const uint32_t zlib_trl_bytes = %d;\n", ZLIB_TRAILER_SIZE); 237 } 238 239 static uint32_t convert_dist_to_dist_sym(uint32_t dist) 240 { 241 assert(dist <= 32768 && dist > 0); 242 if (dist <= 32768) { 243 uint32_t msb = dist > 4 ? bsr(dist - 1) - 2 : 0; 244 return (msb * 2) + ((dist - 1) >> msb); 245 } else { 246 return ~0; 247 } 248 } 249 250 /** 251 * @brief Returns the deflate symbol value for a repeat length. 252 */ 253 static uint32_t convert_length_to_len_sym(uint32_t length) 254 { 255 assert(length > 2 && length < 259); 256 257 /* Based on tables on page 11 in RFC 1951 */ 258 if (length < 11) 259 return 257 + length - 3; 260 else if (length < 19) 261 return 261 + (length - 3) / 2; 262 else if (length < 35) 263 return 265 + (length - 3) / 4; 264 else if (length < 67) 265 return 269 + (length - 3) / 8; 266 else if (length < 131) 267 return 273 + (length - 3) / 16; 268 else if (length < 258) 269 return 277 + (length - 3) / 32; 270 else 271 return 285; 272 } 273 274 void isal_update_histogram_dict(uint8_t * start_stream, int dict_length, int length, 275 struct isal_huff_histogram *histogram) 276 { 277 uint32_t literal = 0, hash; 278 uint16_t seen, *last_seen = histogram->hash_table; 279 uint8_t *current, *end_stream, *next_hash, *end, *end_dict; 280 uint32_t match_length; 281 uint32_t dist; 282 uint64_t *lit_len_histogram = histogram->lit_len_histogram; 283 uint64_t *dist_histogram = histogram->dist_histogram; 284 285 if (length <= 0) 286 return; 287 288 end_stream = start_stream + dict_length + length; 289 end_dict = start_stream + dict_length; 290 291 memset(last_seen, 0, sizeof(histogram->hash_table)); /* Initialize last_seen to be 0. */ 292 293 for (current = start_stream; current < end_dict - 4; current++) { 294 literal = load_le_u32(current); 295 hash = compute_hash(literal) & LVL0_HASH_MASK; 296 last_seen[hash] = (current - start_stream) & 0xFFFF; 297 } 298 299 for (current = start_stream + dict_length; current < end_stream - 3; current++) { 300 literal = load_le_u32(current); 301 hash = compute_hash(literal) & LVL0_HASH_MASK; 302 seen = last_seen[hash]; 303 last_seen[hash] = (current - start_stream) & 0xFFFF; 304 dist = (current - start_stream - seen) & 0xFFFF; 305 if (dist - 1 < D - 1) { 306 assert(start_stream <= current - dist); 307 match_length = 308 compare258(current - dist, current, end_stream - current); 309 if (match_length >= SHORTEST_MATCH) { 310 next_hash = current; 311 #ifdef ISAL_LIMIT_HASH_UPDATE 312 end = next_hash + 3; 313 #else 314 end = next_hash + match_length; 315 #endif 316 if (end > end_stream - 3) 317 end = end_stream - 3; 318 next_hash++; 319 for (; next_hash < end; next_hash++) { 320 literal = load_le_u32(next_hash); 321 hash = compute_hash(literal) & LVL0_HASH_MASK; 322 last_seen[hash] = (next_hash - start_stream) & 0xFFFF; 323 } 324 325 dist_histogram[convert_dist_to_dist_sym(dist)] += 1; 326 lit_len_histogram[convert_length_to_len_sym(match_length)] += 327 1; 328 current += match_length - 1; 329 continue; 330 } 331 } 332 lit_len_histogram[literal & 0xFF] += 1; 333 } 334 335 for (; current < end_stream; current++) 336 lit_len_histogram[*current] += 1; 337 338 lit_len_histogram[256] += 1; 339 return; 340 } 341 342 int main(int argc, char *argv[]) 343 { 344 long int file_length; 345 int argi = 1; 346 uint8_t *stream = NULL; 347 struct isal_hufftables hufftables; 348 struct isal_huff_histogram histogram; 349 struct isal_zstream tmp_stream; 350 FILE *file = NULL; 351 FILE *dict_file = NULL; 352 FILE *hist_file = NULL; 353 long int dict_file_length = 0; 354 long int hist_file_length = 0; 355 uint8_t *dict_stream = NULL; 356 357 if (argc == 1) { 358 printf("Error, no input file.\n"); 359 return 1; 360 } 361 362 if (argc > 3 && argv[1][0] == '-' && argv[1][1] == 'd') { 363 dict_file = fopen(argv[2], "r"); 364 if (dict_file == NULL) { 365 printf("File \"%s\" open error!\n", argv[2]); 366 return 1; 367 } 368 369 fseek(dict_file, 0, SEEK_END); 370 dict_file_length = ftell(dict_file); 371 fseek(dict_file, 0, SEEK_SET); 372 dict_file_length -= ftell(dict_file); 373 dict_stream = malloc(dict_file_length); 374 if (dict_stream == NULL) { 375 printf("Failed to allocate memory to read in dictionary file\n"); 376 fclose(dict_file); 377 return 1; 378 } 379 if (fread(dict_stream, 1, dict_file_length, dict_file) != dict_file_length) { 380 printf("Error occurred when reading dictionary file"); 381 fclose(dict_file); 382 free(dict_stream); 383 return 1; 384 } 385 isal_update_histogram(dict_stream, dict_file_length, &histogram); 386 387 printf("Read %ld bytes of dictionary file %s\n", dict_file_length, argv[2]); 388 argi += 2; 389 fclose(dict_file); 390 free(dict_stream); 391 } 392 393 if ((argc > argi + 1) && argv[argi][0] == '-' && argv[argi][1] == 'h') { 394 hist_file = fopen(argv[argi + 1], "r+"); 395 if (hist_file == NULL) { 396 printf("File \"%s\" open error!\n", argv[argi + 1]); 397 return 1; 398 } 399 fseek(hist_file, 0, SEEK_END); 400 hist_file_length = ftell(hist_file); 401 fseek(hist_file, 0, SEEK_SET); 402 hist_file_length -= ftell(hist_file); 403 if (hist_file_length > sizeof(histogram)) { 404 printf("Histogram file too long\n"); 405 return 1; 406 } 407 if (fread(&histogram, 1, hist_file_length, hist_file) != hist_file_length) { 408 printf("Error occurred when reading history file"); 409 fclose(hist_file); 410 return 1; 411 } 412 fseek(hist_file, 0, SEEK_SET); 413 414 printf("Read %ld bytes of history file %s\n", hist_file_length, 415 argv[argi + 1]); 416 argi += 2; 417 } else 418 memset(&histogram, 0, sizeof(histogram)); /* Initialize histograms. */ 419 420 while (argi < argc) { 421 printf("Processing %s\n", argv[argi]); 422 file = fopen(argv[argi], "r"); 423 if (file == NULL) { 424 printf("Error opening file\n"); 425 return 1; 426 } 427 fseek(file, 0, SEEK_END); 428 file_length = ftell(file); 429 fseek(file, 0, SEEK_SET); 430 file_length -= ftell(file); 431 stream = malloc(file_length + dict_file_length); 432 if (stream == NULL) { 433 printf("Failed to allocate memory to read in file\n"); 434 fclose(file); 435 return 1; 436 } 437 if (dict_file_length > 0) 438 memcpy(stream, dict_stream, dict_file_length); 439 440 if (fread(&stream[dict_file_length], 1, file_length, file) != file_length) { 441 printf("Error occurred when reading file"); 442 fclose(file); 443 free(stream); 444 return 1; 445 } 446 447 /* Create a histogram of frequency of symbols found in stream to 448 * generate the huffman tree.*/ 449 if (0 == dict_file_length) 450 isal_update_histogram(stream, file_length, &histogram); 451 else 452 isal_update_histogram_dict(stream, dict_file_length, file_length, 453 &histogram); 454 455 fclose(file); 456 free(stream); 457 argi++; 458 } 459 460 isal_create_hufftables(&hufftables, &histogram); 461 462 file = fopen("hufftables_c.c", "w"); 463 if (file == NULL) { 464 printf("Error creating file hufftables_c.c\n"); 465 return 1; 466 } 467 468 fprint_header(file); 469 470 fprintf(file, "\n"); 471 472 fprint_hufftables(file, "hufftables_default", &hufftables); 473 474 fprintf(file, "\n"); 475 476 isal_deflate_stateless_init(&tmp_stream); 477 isal_deflate_set_hufftables(&tmp_stream, NULL, IGZIP_HUFFTABLE_STATIC); 478 fprint_hufftables(file, "hufftables_static", tmp_stream.hufftables); 479 480 fclose(file); 481 482 if (hist_file) { 483 int len = fwrite(&histogram, 1, sizeof(histogram), hist_file); 484 printf("wrote %d bytes of histogram file\n", len); 485 fclose(hist_file); 486 } 487 return 0; 488 } 489