xref: /plan9-contrib/sys/src/cmd/lzip/lzip.h (revision 13d37d7716a3e781f408392d7869dff5927c6669)
1 /*  Clzip - LZMA lossless data compressor
2     Copyright (C) 2010-2017 Antonio Diaz Diaz.
3 
4     This program is free software: you can redistribute it and/or modify
5     it under the terms of the GNU General Public License as published by
6     the Free Software Foundation, either version 2 of the License, or
7     (at your option) any later version.
8 
9     This program is distributed in the hope that it will be useful,
10     but WITHOUT ANY WARRANTY; without even the implied warranty of
11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12     GNU General Public License for more details.
13 
14     You should have received a copy of the GNU General Public License
15     along with this program.  If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #ifndef _LZIP_H
19 #define _LZIP_H
20 
21 #include <u.h>
22 #include <libc.h>
23 #include <stdio.h>
24 #include <ctype.h>
25 
26 #define exit(n) exits((n) == 0? 0: "err")
27 #define isatty(fd) 0
28 #define lseek seek
29 
30 #ifndef max
31 #define max(x,y) ((x) >= (y) ? (x) : (y))
32 #endif
33 #ifndef min
34 #define min(x,y) ((x) <= (y) ? (x) : (y))
35 #endif
36 
37 typedef int	State;
38 typedef long int32_t;
39 typedef ulong uint32_t;
40 typedef int bool;
41 
42 enum { false, true };
43 
44 enum { states = 12 };
45 enum {
46 	min_dict_bits = 12,
47 	min_dict_size = 1 << min_dict_bits,	/* >= modeled_distances */
48 	max_dict_bits = 29,
49 	max_dict_size = 1 << max_dict_bits,
50 	min_member_size = 36,
51 	literal_context_bits = 3,
52 	literal_pos_state_bits = 0, 			/* not used */
53 	pos_state_bits = 2,
54 	pos_states = 1 << pos_state_bits,
55 	pos_state_mask = pos_states -1,
56 
57 	len_states = 4,
58 	dis_slot_bits = 6,
59 	start_dis_model = 4,
60 	end_dis_model = 14,
61 	modeled_distances = 1 << (end_dis_model / 2), 	/* 128 */
62 	dis_align_bits = 4,
63 	dis_align_size = 1 << dis_align_bits,
64 
65 	len_low_bits = 3,
66 	len_mid_bits = 3,
67 	len_high_bits = 8,
68 	len_low_syms = 1 << len_low_bits,
69 	len_mid_syms = 1 << len_mid_bits,
70 	len_high_syms = 1 << len_high_bits,
71 	max_len_syms = len_low_syms + len_mid_syms + len_high_syms,
72 
73 	min_match_len = 2, 					/* must be 2 */
74 	max_match_len = min_match_len + max_len_syms - 1, 	/* 273 */
75 	min_match_len_limit = 5,
76 
77 	bit_model_move_bits = 5,
78 	bit_model_total_bits = 11,
79 	bit_model_total = 1 << bit_model_total_bits,
80 };
81 
82 typedef struct Len_model Len_model;
83 typedef struct Pretty_print Pretty_print;
84 typedef struct Matchfinder_base Matchfinder_base;
85 typedef int	Bit_model;
86 
87 struct Len_model {
88 	Bit_model choice1;
89 	Bit_model choice2;
90 	Bit_model bm_low[pos_states][len_low_syms];
91 	Bit_model bm_mid[pos_states][len_mid_syms];
92 	Bit_model bm_high[len_high_syms];
93 };
94 struct Pretty_print {
95 	char	*name;
96 	char	*stdin_name;
97 	ulong	longest_name;
98 	bool	first_post;
99 };
100 
101 typedef ulong CRC32[256];	/* Table of CRCs of all 8-bit messages. */
102 
103 extern CRC32 crc32;
104 
105 #define errno 0
106 
107 static uchar magic_string[4] = { "LZIP" };
108 
109 typedef uchar File_header[6];		/* 0-3 magic bytes */
110 /*   4 version */
111 /*   5 coded_dict_size */
112 enum { Fh_size = 6 };
113 
114 typedef uchar File_trailer[20];
115 /*  0-3  CRC32 of the uncompressed data */
116 /*  4-11 size of the uncompressed data */
117 /* 12-19 member size including header and trailer */
118 
119 enum { Ft_size = 20 };
120 
121 enum {
122 	price_shift_bits = 6,
123 	price_step_bits = 2,
124 	price_step = 1 << price_step_bits,
125 };
126 
127 typedef uchar Dis_slots[1<<10];
128 typedef short	Prob_prices[bit_model_total >> price_step_bits];
129 
130 extern Dis_slots dis_slots;
131 extern Prob_prices prob_prices;
132 
133 #define get_price(prob)		prob_prices[(prob) >> price_step_bits]
134 #define price0(prob)		get_price(prob)
135 #define price1(prob)		get_price(bit_model_total - (prob))
136 #define price_bit(bm, bit)	((bit)? price1(bm): price0(bm))
137 
138 #define Mb_ptr_to_current_pos(mb) ((mb)->buffer + (mb)->pos)
139 #define Mb_peek(mb, distance) (mb)->buffer[(mb)->pos - (distance)]
140 
141 #define Lp_price(lp, len, pos_state) \
142 	(lp)->prices[pos_state][(len) - min_match_len]
143 
144 #define Tr_update(trial, pr, distance4, p_i) \
145 { \
146 	if ((pr) < (trial)->price) { \
147 		(trial)->price = pr; \
148 		(trial)->dis4 = distance4; \
149 		(trial)->prev_index = p_i; \
150 		(trial)->prev_index2 = single_step_trial; \
151 	} else { \
152 	} \
153 }
154 
155 /* these functions are now extern and must be defined exactly once */
156 #ifdef	_DEFINE_INLINES
157 #define	_INLINES_DEFINED
158 
159 int
get_len_state(int len)160 get_len_state(int len)
161 {
162 	int lenstm1, lenmmm;
163 
164 	lenmmm = len - min_match_len;
165 	lenstm1 = len_states - 1;
166 	if (lenmmm < lenstm1)
167 		return lenmmm;
168 	else
169 		return lenstm1;
170 }
171 
172 State
St_set_char(State st)173 St_set_char(State st)
174 {
175 	static State next[states] = { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5 };
176 
177 	assert((unsigned)st < nelem(next));
178 	return next[st];
179 }
180 
181 int
get_lit_state(uchar prev_byte)182 get_lit_state(uchar prev_byte)
183 {
184 	return prev_byte >> (8 - literal_context_bits);
185 }
186 
187 void
Bm_init(Bit_model * probability)188 Bm_init(Bit_model *probability)
189 {
190 	*probability = bit_model_total / 2;
191 }
192 
193 void
Bm_array_init(Bit_model bm[],int size)194 Bm_array_init(Bit_model bm[], int size)
195 {
196 	int i;
197 
198 	for (i = 0; i < size; ++i)
199 		Bm_init(&bm[i]);
200 }
201 
202 void
Lm_init(Len_model * lm)203 Lm_init(Len_model *lm)
204 {
205 	Bm_init(&lm->choice1);
206 	Bm_init(&lm->choice2);
207 	Bm_array_init(lm->bm_low[0], pos_states * len_low_syms);
208 	Bm_array_init(lm->bm_mid[0], pos_states * len_mid_syms);
209 	Bm_array_init(lm->bm_high, len_high_syms);
210 }
211 
212 void
Pp_init(Pretty_print * pp,char * filenames[],int num_filenames,int verbosity)213 Pp_init(Pretty_print *pp, char *filenames[], int num_filenames, int verbosity)
214 {
215 	unsigned stdin_name_len;
216 	int i;
217 
218 	pp->name = 0;
219 	pp->stdin_name = "(stdin)";
220 	pp->longest_name = 0;
221 	pp->first_post = false;
222 
223 	if (verbosity <= 0)
224 		return;
225 	stdin_name_len = strlen(pp->stdin_name);
226 	for (i = 0; i < num_filenames; ++i) {
227 		char *s = filenames[i];
228 		unsigned len = strcmp(s, "-") == 0? stdin_name_len: strlen(s);
229 
230 		if (len > pp->longest_name)
231 			pp->longest_name = len;
232 	}
233 	if (pp->longest_name == 0)
234 		pp->longest_name = stdin_name_len;
235 }
236 
237 void
Pp_set_name(Pretty_print * pp,char * filename)238 Pp_set_name(Pretty_print *pp, char *filename)
239 {
240 	if ( filename && filename[0] && strcmp( filename, "-" ) != 0 )
241 		pp->name = filename;
242 	else
243 		pp->name = pp->stdin_name;
244 	pp->first_post = true;
245 }
246 
247 void
Pp_reset(Pretty_print * pp)248 Pp_reset(Pretty_print *pp)
249 {
250 	if (pp->name && pp->name[0])
251 		pp->first_post = true;
252 }
253 
254 void
255 Pp_show_msg(Pretty_print *pp, char *msg);
256 
257 void
CRC32_init(void)258 CRC32_init(void)
259 {
260 	unsigned n;
261 
262 	for (n = 0; n < 256; ++n) {
263 		unsigned	c = n;
264 		int	k;
265 		for (k = 0; k < 8; ++k) {
266 			if (c & 1)
267 				c = 0xEDB88320U ^ (c >> 1);
268 			else
269 				c >>= 1;
270 		}
271 		crc32[n] = c;
272 	}
273 }
274 
275 void
CRC32_update_byte(uint32_t * crc,uchar byte)276 CRC32_update_byte(uint32_t *crc, uchar byte)
277 {
278 	*crc = crc32[(*crc^byte)&0xFF] ^ (*crc >> 8);
279 }
280 
281 void
CRC32_update_buf(uint32_t * crc,uchar * buffer,int size)282 CRC32_update_buf(uint32_t *crc, uchar *buffer, int size)
283 {
284 	int	i;
285 	uint32_t c = *crc;
286 	for (i = 0; i < size; ++i)
287 		c = crc32[(c^buffer[i])&0xFF] ^ (c >> 8);
288 	*crc = c;
289 }
290 
291 bool
isvalid_ds(unsigned dict_size)292 isvalid_ds(unsigned dict_size)
293 {
294 	return (dict_size >= min_dict_size &&
295 	    dict_size <= max_dict_size);
296 }
297 
298 int
real_bits(unsigned value)299 real_bits(unsigned value)
300 {
301 	int	bits = 0;
302 
303 	while (value > 0) {
304 		value >>= 1;
305 		++bits;
306 	}
307 	return bits;
308 }
309 
310 void
Fh_set_magic(File_header data)311 Fh_set_magic(File_header data)
312 {
313 	memcpy(data, magic_string, 4);
314 	data[4] = 1;
315 }
316 
317 bool
Fh_verify_magic(File_header data)318 Fh_verify_magic(File_header data)
319 {
320 	return (memcmp(data, magic_string, 4) == 0);
321 }
322 
323 /* detect truncated header */
324 bool
Fh_verify_prefix(File_header data,int size)325 Fh_verify_prefix(File_header data, int size)
326 {
327 	int	i;
328 	for (i = 0; i < size && i < 4; ++i)
329 		if (data[i] != magic_string[i])
330 			return false;
331 	return (size > 0);
332 }
333 
334 uchar
Fh_version(File_header data)335 Fh_version(File_header data)
336 {
337 	return data[4];
338 }
339 
340 bool
Fh_verify_version(File_header data)341 Fh_verify_version(File_header data)
342 {
343 	return (data[4] == 1);
344 }
345 
346 unsigned
Fh_get_dict_size(File_header data)347 Fh_get_dict_size(File_header data)
348 {
349 	unsigned	sz = (1 << (data[5] &0x1F));
350 	if (sz > min_dict_size)
351 		sz -= (sz / 16) * ((data[5] >> 5) & 7);
352 	return sz;
353 }
354 
355 bool
Fh_set_dict_size(File_header data,unsigned sz)356 Fh_set_dict_size(File_header data, unsigned sz)
357 {
358 	if (!isvalid_ds(sz))
359 		return false;
360 	data[5] = real_bits(sz - 1);
361 	if (sz > min_dict_size) {
362 		unsigned base_size = 1 << data[5];
363 		unsigned fraction = base_size / 16;
364 		unsigned	i;
365 		for (i = 7; i >= 1; --i)
366 			if (base_size - (i * fraction) >= sz) {
367 				data[5] |= (i << 5);
368 				break;
369 			}
370 	}
371 	return true;
372 }
373 
374 unsigned
Ft_get_data_crc(File_trailer data)375 Ft_get_data_crc(File_trailer data)
376 {
377 	unsigned	tmp = 0;
378 	int	i;
379 	for (i = 3; i >= 0; --i) {
380 		tmp <<= 8;
381 		tmp += data[i];
382 	}
383 	return tmp;
384 }
385 
386 void
Ft_set_data_crc(File_trailer data,unsigned crc)387 Ft_set_data_crc(File_trailer data, unsigned crc)
388 {
389 	int	i;
390 	for (i = 0; i <= 3; ++i) {
391 		data[i] = (uchar)crc;
392 		crc >>= 8;
393 	}
394 }
395 
396 uvlong
Ft_get_data_size(File_trailer data)397 Ft_get_data_size(File_trailer data)
398 {
399 	uvlong	tmp = 0;
400 	int	i;
401 	for (i = 11; i >= 4; --i) {
402 		tmp <<= 8;
403 		tmp += data[i];
404 	}
405 	return tmp;
406 }
407 
408 void
Ft_set_data_size(File_trailer data,uvlong sz)409 Ft_set_data_size(File_trailer data, uvlong sz)
410 {
411 	int	i;
412 	for (i = 4; i <= 11; ++i) {
413 		data[i] = (uchar)sz;
414 		sz >>= 8;
415 	}
416 }
417 
418 uvlong
Ft_get_member_size(File_trailer data)419 Ft_get_member_size(File_trailer data)
420 {
421 	uvlong	tmp = 0;
422 	int	i;
423 	for (i = 19; i >= 12; --i) {
424 		tmp <<= 8;
425 		tmp += data[i];
426 	}
427 	return tmp;
428 }
429 
430 void
Ft_set_member_size(File_trailer data,uvlong sz)431 Ft_set_member_size(File_trailer data, uvlong sz)
432 {
433 	int	i;
434 	for (i = 12; i <= 19; ++i) {
435 		data[i] = (uchar)sz;
436 		sz >>= 8;
437 	}
438 }
439 #else					/* _DEFINE_INLINES */
440 void	Bm_array_init(Bit_model bm[], int size);
441 void	Bm_init(Bit_model *probability);
442 void	CRC32_init(void);
443 void	CRC32_update_buf(uint32_t *crc, uchar *buffer, int size);
444 void	CRC32_update_byte(uint32_t *crc, uchar byte);
445 unsigned	Fh_get_dict_size(File_header data);
446 bool	Fh_set_dict_size(File_header data, unsigned sz);
447 void	Fh_set_magic(File_header data);
448 bool	Fh_verify_magic(File_header data);
449 bool	Fh_verify_prefix(File_header data, int size);
450 bool	Fh_verify_version(File_header data);
451 uchar	Fh_version(File_header data);
452 unsigned	Ft_get_data_crc(File_trailer data);
453 uvlong	Ft_get_data_size(File_trailer data);
454 uvlong	Ft_get_member_size(File_trailer data);
455 void	Ft_set_data_crc(File_trailer data, unsigned crc);
456 void	Ft_set_data_size(File_trailer data, uvlong sz);
457 void	Ft_set_member_size(File_trailer data, uvlong sz);
458 void	Lm_init(Len_model *lm);
459 void	Pp_init(Pretty_print *pp, char *filenames[], int num_filenames, int verbosity);
460 void	Pp_reset(Pretty_print *pp);
461 void	Pp_set_name(Pretty_print *pp, char *filename);
462 void	Pp_show_msg(Pretty_print *pp, char *msg);
463 State	St_set_char(State st);
464 int	get_lit_state(uchar prev_byte);
465 int	get_len_state(int len);
466 bool	isvalid_ds(unsigned dict_size);
467 int	real_bits(unsigned value);
468 #endif					/* _DEFINE_INLINES */
469 
470 #define St_is_char(state)	((state) < 7)
471 #define St_set_match(state)	((state) < 7? 7: 10)
472 #define St_set_rep(state)	((state) < 7? 8: 11)
473 #define St_set_short_rep(state)	((state) < 7? 9: 11)
474 
475 static char *bad_magic_msg = "Bad magic number (file not in lzip format).";
476 static char *bad_dict_msg = "Invalid dictionary size in member header.";
477 static char *trailing_msg = "Trailing data not allowed.";
478 
479 /* defined in decoder.c */
480 int	readblock(int fd, uchar *buf, int size);
481 int	writeblock(int fd, uchar *buf, int size);
482 
483 /* defined in main.c */
484 extern int verbosity;
485 Dir;
486 char	*bad_version(unsigned version);
487 char	*format_ds(unsigned dict_size);
488 int	open_instream(char *name, Dir *in_statsp, bool no_ofile, bool reg_only);
489 void	*resize_buffer(void *buf, unsigned min_size);
490 void	cleanup_and_fail(int retval);
491 void	show_error(char *msg, int errcode, bool help);
492 void	show_file_error(char *filename, char *msg, int errcode);
493 void	internal_error(char *msg);
494 struct Matchfinder_base;
495 void	show_progress(uvlong partial_size, Matchfinder_base *m, Pretty_print *p,
496 	uvlong cfile_size);
497 #endif
498