xref: /plan9-contrib/sys/src/cmd/lzip/encoder_base.h (revision 13d37d7716a3e781f408392d7869dff5927c6669)
1 /*  Clzip - LZMA lossless data compressor
2     Copyright (C) 2010-2017 Antonio Diaz Diaz.
3 
4     This program is free software: you can redistribute it and/or modify
5     it under the terms of the GNU General Public License as published by
6     the Free Software Foundation, either version 2 of the License, or
7     (at your option) any later version.
8 
9     This program is distributed in the hope that it will be useful,
10     but WITHOUT ANY WARRANTY; without even the implied warranty of
11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12     GNU General Public License for more details.
13 
14     You should have received a copy of the GNU General Public License
15     along with this program.  If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include "lzip.h"
19 
20 static void
Dis_slots_init(void)21 Dis_slots_init(void)
22 {
23 	int	i, size, slot;
24 	for (slot = 0; slot < 4; ++slot)
25 		dis_slots[slot] = slot;
26 	for (i = 4, size = 2, slot = 4; slot < 20; slot += 2) {
27 		memset(&dis_slots[i], slot, size);
28 		memset(&dis_slots[i+size], slot + 1, size);
29 		size <<= 1;
30 		i += size;
31 	}
32 }
33 
34 static uchar
get_slot(unsigned dis)35 get_slot(unsigned dis)
36 {
37 	if (dis < (1 << 10))
38 		return dis_slots[dis];
39 	if (dis < (1 << 19))
40 		return dis_slots[dis>> 9] + 18;
41 	if (dis < (1 << 28))
42 		return dis_slots[dis>>18] + 36;
43 	return dis_slots[dis>>27] + 54;
44 }
45 
46 static void
Prob_prices_init(void)47 Prob_prices_init(void)
48 {
49 	int	i, j;
50 	for (i = 0; i < bit_model_total >> price_step_bits; ++i) {
51 		unsigned val = (i * price_step) + (price_step / 2);
52 		int bits = 0;			/* base 2 logarithm of val */
53 
54 		for (j = 0; j < price_shift_bits; ++j) {
55 			val = val * val;
56 			bits <<= 1;
57 			while (val >= (1 << 16)) {
58 				val >>= 1;
59 				++bits;
60 			}
61 		}
62 		bits += 15;			/* remaining bits in val */
63 		prob_prices[i] = (bit_model_total_bits << price_shift_bits) - bits;
64 	}
65 }
66 
67 static int
price_symbol3(Bit_model bm[],int symbol)68 price_symbol3(Bit_model bm[], int symbol)
69 {
70 	int price;
71 	bool bit = symbol & 1;
72 
73 	symbol |= 8;
74 	symbol >>= 1;
75 	price = price_bit(bm[symbol], bit);
76 	bit = symbol & 1;
77 	symbol >>= 1;
78 	price += price_bit(bm[symbol], bit);
79 	return price + price_bit(bm[1], symbol & 1);
80 }
81 
82 static int
price_symbol6(Bit_model bm[],unsigned symbol)83 price_symbol6(Bit_model bm[], unsigned symbol)
84 {
85 	int	price;
86 	bool bit = symbol & 1;
87 
88 	symbol |= 64;
89 	symbol >>= 1;
90 	price = price_bit(bm[symbol], bit);
91 	bit = symbol & 1;
92 	symbol >>= 1;
93 	price += price_bit(bm[symbol], bit);
94 	bit = symbol & 1;
95 	symbol >>= 1;
96 	price += price_bit(bm[symbol], bit);
97 	bit = symbol & 1;
98 	symbol >>= 1;
99 	price += price_bit(bm[symbol], bit);
100 	bit = symbol & 1;
101 	symbol >>= 1;
102 	price += price_bit(bm[symbol], bit);
103 	return price + price_bit(bm[1], symbol & 1);
104 }
105 
106 static int
price_symbol8(Bit_model bm[],int symbol)107 price_symbol8(Bit_model bm[], int symbol)
108 {
109 	int	price;
110 	bool bit = symbol & 1;
111 	symbol |= 0x100;
112 	symbol >>= 1;
113 	price = price_bit(bm[symbol], bit);
114 	bit = symbol & 1;
115 	symbol >>= 1;
116 	price += price_bit(bm[symbol], bit);
117 	bit = symbol & 1;
118 	symbol >>= 1;
119 	price += price_bit(bm[symbol], bit);
120 	bit = symbol & 1;
121 	symbol >>= 1;
122 	price += price_bit(bm[symbol], bit);
123 	bit = symbol & 1;
124 	symbol >>= 1;
125 	price += price_bit(bm[symbol], bit);
126 	bit = symbol & 1;
127 	symbol >>= 1;
128 	price += price_bit(bm[symbol], bit);
129 	bit = symbol & 1;
130 	symbol >>= 1;
131 	price += price_bit(bm[symbol], bit);
132 	return price + price_bit(bm[1], symbol & 1);
133 }
134 
135 static int
price_symbol_reversed(Bit_model bm[],int symbol,int num_bits)136 price_symbol_reversed(Bit_model bm[], int symbol, int num_bits)
137 {
138 	int	price = 0;
139 	int	model = 1;
140 	int	i;
141 
142 	for (i = num_bits; i > 0; --i) {
143 		bool bit = symbol & 1;
144 		symbol >>= 1;
145 		price += price_bit(bm[model], bit);
146 		model = (model << 1) | bit;
147 	}
148 	return price;
149 }
150 
151 static int
price_matched(Bit_model bm[],unsigned symbol,unsigned match_byte)152 price_matched(Bit_model bm[], unsigned symbol, unsigned match_byte)
153 {
154 	int price = 0;
155 	unsigned mask = 0x100;
156 
157 	symbol |= mask;
158 	for (;;) {
159 		unsigned match_bit = (match_byte <<= 1) & mask;
160 		bool bit = (symbol <<= 1) & 0x100;
161 
162 		price += price_bit(bm[(symbol>>9) + match_bit + mask], bit);
163 		if (symbol >= 0x10000)
164 			return price;
165 		mask &= ~(match_bit ^ symbol);
166 		/* if(match_bit != bit) mask = 0; */
167 	}
168 }
169 
170 struct Matchfinder_base {
171 	uvlong partial_data_pos;
172 	uchar * buffer;		/* input buffer */
173 	int32_t * prev_positions;	/* 1 + last seen position of key. else 0 */
174 	int32_t * pos_array;		/* may be tree or chain */
175 	int	before_size;		/* bytes to keep in buffer before dictionary */
176 	int	buffer_size;
177 	int	dict_size;	/* bytes to keep in buffer before pos */
178 	int	pos;			/* current pos in buffer */
179 	int	cyclic_pos;		/* cycles through [0, dict_size] */
180 	int	stream_pos;		/* first byte not yet read from file */
181 	int	pos_limit;		/* when reached, a new block must be read */
182 	int	key4_mask;
183 	int	num_prev_positions;	/* size of prev_positions */
184 	int	pos_array_size;
185 	int	infd;			/* input file descriptor */
186 	bool at_stream_end;		/* stream_pos shows real end of file */
187 };
188 
189 bool	Mb_read_block(Matchfinder_base *mb);
190 void	Mb_normalize_pos(Matchfinder_base *mb);
191 bool	Mb_init(Matchfinder_base *mb, int before, int dict_size, int after_size, int dict_factor, int num_prev_positions23, int pos_array_factor, int ifd);
192 
193 static void
Mb_free(Matchfinder_base * mb)194 Mb_free(Matchfinder_base *mb)
195 {
196 	free(mb->prev_positions);
197 	free(mb->buffer);
198 }
199 
200 static int
Mb_avail_bytes(Matchfinder_base * mb)201 Mb_avail_bytes(Matchfinder_base *mb)
202 {
203 	return mb->stream_pos - mb->pos;
204 }
205 
206 static uvlong
Mb_data_position(Matchfinder_base * mb)207 Mb_data_position(Matchfinder_base *mb)
208 {
209 	return mb->partial_data_pos + mb->pos;
210 }
211 
212 static bool
Mb_data_finished(Matchfinder_base * mb)213 Mb_data_finished(Matchfinder_base *mb)
214 {
215 	return mb->at_stream_end && mb->pos >= mb->stream_pos;
216 }
217 
218 static int
Mb_true_match_len(Matchfinder_base * mb,int index,int distance)219 Mb_true_match_len(Matchfinder_base *mb, int index, int distance)
220 {
221 	uchar * data = mb->buffer + mb->pos;
222 	int	i = index;
223 	int len_limit = min(Mb_avail_bytes(mb), max_match_len);
224 	while (i < len_limit && data[i-distance] == data[i])
225 		++i;
226 	return i;
227 }
228 
229 static void
Mb_move_pos(Matchfinder_base * mb)230 Mb_move_pos(Matchfinder_base *mb)
231 {
232 	if (++mb->cyclic_pos > mb->dict_size)
233 		mb->cyclic_pos = 0;
234 	if (++mb->pos >= mb->pos_limit)
235 		Mb_normalize_pos(mb);
236 }
237 
238 void	Mb_reset(Matchfinder_base *mb);
239 
240 enum {	re_buffer_size = 65536 };
241 
242 typedef struct LZ_encoder_base LZ_encoder_base;
243 typedef struct Matchfinder_base Matchfinder_base;
244 typedef struct Range_encoder Range_encoder;
245 
246 struct Range_encoder {
247 	uvlong low;
248 	uvlong partial_member_pos;
249 	uchar * buffer;		/* output buffer */
250 	int	pos;			/* current pos in buffer */
251 	uint32_t range;
252 	unsigned	ff_count;
253 	int	outfd;			/* output file descriptor */
254 	uchar cache;
255 	File_header header;
256 };
257 
258 void	Re_flush_data(Range_encoder *renc);
259 
260 static void
Re_put_byte(Range_encoder * renc,uchar b)261 Re_put_byte(Range_encoder *renc, uchar b)
262 {
263 	renc->buffer[renc->pos] = b;
264 	if (++renc->pos >= re_buffer_size)
265 		Re_flush_data(renc);
266 }
267 
268 static void
Re_shift_low(Range_encoder * renc)269 Re_shift_low(Range_encoder *renc)
270 {
271 	if (renc->low >> 24 != 0xFF) {
272 		bool carry = (renc->low > 0xFFFFFFFFU);
273 		Re_put_byte(renc, renc->cache + carry);
274 		for (; renc->ff_count > 0; --renc->ff_count)
275 			Re_put_byte(renc, 0xFF + carry);
276 		renc->cache = renc->low >> 24;
277 	} else
278 		++renc->ff_count;
279 	renc->low = (renc->low & 0x00FFFFFFU) << 8;
280 }
281 
282 static void
Re_reset(Range_encoder * renc)283 Re_reset(Range_encoder *renc)
284 {
285 	int	i;
286 	renc->low = 0;
287 	renc->partial_member_pos = 0;
288 	renc->pos = 0;
289 	renc->range = 0xFFFFFFFFU;
290 	renc->ff_count = 0;
291 	renc->cache = 0;
292 	for (i = 0; i < Fh_size; ++i)
293 		Re_put_byte(renc, renc->header[i]);
294 }
295 
296 static bool
Re_init(Range_encoder * renc,unsigned dict_size,int ofd)297 Re_init(Range_encoder *renc, unsigned dict_size, int ofd)
298 {
299 	renc->buffer = (uchar *)malloc(re_buffer_size);
300 	if (!renc->buffer)
301 		return false;
302 	renc->outfd = ofd;
303 	Fh_set_magic(renc->header);
304 	Fh_set_dict_size(renc->header, dict_size);
305 	Re_reset(renc);
306 	return true;
307 }
308 
309 static void
Re_free(Range_encoder * renc)310 Re_free(Range_encoder *renc)
311 {
312 	free(renc->buffer);
313 }
314 
315 static uvlong
Re_member_position(Range_encoder * renc)316 Re_member_position(Range_encoder *renc)
317 {
318 	return renc->partial_member_pos + renc->pos + renc->ff_count;
319 }
320 
321 static void
Re_flush(Range_encoder * renc)322 Re_flush(Range_encoder *renc)
323 {
324 	int	i;
325 	for (i = 0; i < 5; ++i)
326 		Re_shift_low(renc);
327 }
328 
329 static void
Re_encode(Range_encoder * renc,int symbol,int num_bits)330 Re_encode(Range_encoder *renc, int symbol, int num_bits)
331 {
332 	unsigned	mask;
333 	for (mask = 1 << (num_bits - 1); mask > 0; mask >>= 1) {
334 		renc->range >>= 1;
335 		if (symbol & mask)
336 			renc->low += renc->range;
337 		if (renc->range <= 0x00FFFFFFU) {
338 			renc->range <<= 8;
339 			Re_shift_low(renc);
340 		}
341 	}
342 }
343 
344 static void
Re_encode_bit(Range_encoder * renc,Bit_model * probability,bool bit)345 Re_encode_bit(Range_encoder *renc, Bit_model *probability, bool bit)
346 {
347 	Bit_model prob = *probability;
348 	uint32_t bound = (renc->range >> bit_model_total_bits) * prob;
349 
350 	if (!bit) {
351 		renc->range = bound;
352 		*probability += (bit_model_total - prob) >> bit_model_move_bits;
353 	} else {
354 		renc->low += bound;
355 		renc->range -= bound;
356 		*probability -= prob >> bit_model_move_bits;
357 	}
358 	if (renc->range <= 0x00FFFFFFU) {
359 		renc->range <<= 8;
360 		Re_shift_low(renc);
361 	}
362 }
363 
364 static void
Re_encode_tree3(Range_encoder * renc,Bit_model bm[],int symbol)365 Re_encode_tree3(Range_encoder *renc, Bit_model bm[], int symbol)
366 {
367 	int model = 1;
368 	bool bit = (symbol >> 2) & 1;
369 
370 	Re_encode_bit(renc, &bm[model], bit);
371 	model = (model << 1) | bit;
372 	bit = (symbol >> 1) & 1;
373 	Re_encode_bit(renc, &bm[model], bit);
374 	model = (model << 1) | bit;
375 	Re_encode_bit(renc, &bm[model], symbol & 1);
376 }
377 
378 static void
Re_encode_tree6(Range_encoder * renc,Bit_model bm[],unsigned symbol)379 Re_encode_tree6(Range_encoder *renc, Bit_model bm[], unsigned symbol)
380 {
381 	int	model = 1;
382 	bool bit = (symbol >> 5) & 1;
383 	Re_encode_bit(renc, &bm[model], bit);
384 	model = (model << 1) | bit;
385 	bit = (symbol >> 4) & 1;
386 	Re_encode_bit(renc, &bm[model], bit);
387 	model = (model << 1) | bit;
388 	bit = (symbol >> 3) & 1;
389 	Re_encode_bit(renc, &bm[model], bit);
390 	model = (model << 1) | bit;
391 	bit = (symbol >> 2) & 1;
392 	Re_encode_bit(renc, &bm[model], bit);
393 	model = (model << 1) | bit;
394 	bit = (symbol >> 1) & 1;
395 	Re_encode_bit(renc, &bm[model], bit);
396 	model = (model << 1) | bit;
397 	Re_encode_bit(renc, &bm[model], symbol & 1);
398 }
399 
400 static void
Re_encode_tree8(Range_encoder * renc,Bit_model bm[],int symbol)401 Re_encode_tree8(Range_encoder *renc, Bit_model bm[], int symbol)
402 {
403 	int	model = 1;
404 	int	i;
405 	for (i = 7; i >= 0; --i) {
406 		bool bit = (symbol >> i) & 1;
407 		Re_encode_bit(renc, &bm[model], bit);
408 		model = (model << 1) | bit;
409 	}
410 }
411 
412 static void
Re_encode_tree_reversed(Range_encoder * renc,Bit_model bm[],int symbol,int num_bits)413 Re_encode_tree_reversed(Range_encoder *renc, Bit_model bm[], int symbol, int num_bits)
414 {
415 	int	model = 1;
416 	int	i;
417 	for (i = num_bits; i > 0; --i) {
418 		bool bit = symbol & 1;
419 		symbol >>= 1;
420 		Re_encode_bit(renc, &bm[model], bit);
421 		model = (model << 1) | bit;
422 	}
423 }
424 
425 static void
Re_encode_matched(Range_encoder * renc,Bit_model bm[],unsigned symbol,unsigned match_byte)426 Re_encode_matched(Range_encoder *renc, Bit_model bm[], unsigned symbol, unsigned match_byte)
427 {
428 	unsigned	mask = 0x100;
429 	symbol |= mask;
430 	while (true) {
431 		unsigned match_bit = (match_byte <<= 1) & mask;
432 		bool bit = (symbol <<= 1) & 0x100;
433 		Re_encode_bit(renc, &bm[(symbol>>9)+match_bit+mask], bit);
434 		if (symbol >= 0x10000)
435 			break;
436 		mask &= ~(match_bit ^ symbol);
437 		/* if(match_bit != bit) mask = 0; */
438 	}
439 }
440 
441 static void
Re_encode_len(struct Range_encoder * renc,Len_model * lm,int symbol,int pos_state)442 Re_encode_len(struct Range_encoder *renc, Len_model *lm, int symbol, int pos_state)
443 {
444 	bool bit = ((symbol -= min_match_len) >= len_low_syms);
445 	Re_encode_bit(renc, &lm->choice1, bit);
446 	if (!bit)
447 		Re_encode_tree3(renc, lm->bm_low[pos_state], symbol);
448 	else {
449 		bit = ((symbol -= len_low_syms) >= len_mid_syms);
450 		Re_encode_bit(renc, &lm->choice2, bit);
451 		if (!bit)
452 			Re_encode_tree3(renc, lm->bm_mid[pos_state], symbol);
453 		else
454 			Re_encode_tree8(renc, lm->bm_high, symbol - len_mid_syms);
455 	}
456 }
457 
458 enum {
459 	max_marker_size = 16,
460 	num_rep_distances = 4		/* must be 4 */
461 };
462 
463 struct LZ_encoder_base {
464 	struct Matchfinder_base mb;
465 	uint32_t crc;
466 
467 	Bit_model bm_literal[1<<literal_context_bits][0x300];
468 	Bit_model bm_match[states][pos_states];
469 	Bit_model bm_rep[states];
470 	Bit_model bm_rep0[states];
471 	Bit_model bm_rep1[states];
472 	Bit_model bm_rep2[states];
473 	Bit_model bm_len[states][pos_states];
474 	Bit_model bm_dis_slot[len_states][1<<dis_slot_bits];
475 	Bit_model bm_dis[modeled_distances-end_dis_model+1];
476 	Bit_model bm_align[dis_align_size];
477 	struct Len_model match_len_model;
478 	struct Len_model rep_len_model;
479 	struct Range_encoder renc;
480 };
481 
482 void	LZeb_reset(LZ_encoder_base *eb);
483 
484 static bool
LZeb_init(LZ_encoder_base * eb,int before,int dict_size,int after_size,int dict_factor,int num_prev_positions23,int pos_array_factor,int ifd,int outfd)485 LZeb_init(LZ_encoder_base *eb, int before, int dict_size, int after_size, int dict_factor, int num_prev_positions23, int pos_array_factor, int ifd, int outfd)
486 {
487 	if (!Mb_init(&eb->mb, before, dict_size, after_size, dict_factor,
488 	    num_prev_positions23, pos_array_factor, ifd))
489 		return false;
490 	if (!Re_init(&eb->renc, eb->mb.dict_size, outfd))
491 		return false;
492 	LZeb_reset(eb);
493 	return true;
494 }
495 
496 static void
LZeb_free(LZ_encoder_base * eb)497 LZeb_free(LZ_encoder_base *eb)
498 {
499 	Re_free(&eb->renc);
500 	Mb_free(&eb->mb);
501 }
502 
503 static unsigned
LZeb_crc(LZ_encoder_base * eb)504 LZeb_crc(LZ_encoder_base *eb)
505 {
506 	return eb->crc ^ 0xFFFFFFFFU;
507 }
508 
509 static int
LZeb_price_literal(LZ_encoder_base * eb,uchar prev_byte,uchar symbol)510 LZeb_price_literal(LZ_encoder_base *eb, uchar prev_byte, uchar symbol)
511 {
512 	return price_symbol8(eb->bm_literal[get_lit_state(prev_byte)], symbol);
513 }
514 
515 static int
LZeb_price_matched(LZ_encoder_base * eb,uchar prev_byte,uchar symbol,uchar match_byte)516 LZeb_price_matched(LZ_encoder_base *eb, uchar prev_byte, uchar symbol, uchar match_byte)
517 {
518 	return price_matched(eb->bm_literal[get_lit_state(prev_byte)], symbol,
519 	    match_byte);
520 }
521 
522 static void
LZeb_encode_literal(LZ_encoder_base * eb,uchar prev_byte,uchar symbol)523 LZeb_encode_literal(LZ_encoder_base *eb, uchar prev_byte, uchar symbol)
524 {
525 	Re_encode_tree8(&eb->renc, eb->bm_literal[get_lit_state(prev_byte)],
526 	    symbol);
527 }
528 
529 static void
LZeb_encode_matched(LZ_encoder_base * eb,uchar prev_byte,uchar symbol,uchar match_byte)530 LZeb_encode_matched(LZ_encoder_base *eb, uchar prev_byte, uchar symbol, uchar match_byte)
531 {
532 	Re_encode_matched(&eb->renc, eb->bm_literal[get_lit_state(prev_byte)],
533 	    symbol, match_byte);
534 }
535 
536 static void
LZeb_encode_pair(LZ_encoder_base * eb,unsigned dis,int len,int pos_state)537 LZeb_encode_pair(LZ_encoder_base *eb, unsigned dis, int len, int pos_state)
538 {
539 	unsigned dis_slot = get_slot(dis);
540 	Re_encode_len(&eb->renc, &eb->match_len_model, len, pos_state);
541 	Re_encode_tree6(&eb->renc, eb->bm_dis_slot[get_len_state(len)], dis_slot);
542 
543 	if (dis_slot >= start_dis_model) {
544 		int direct_bits = (dis_slot >> 1) - 1;
545 		unsigned base = (2 | (dis_slot & 1)) << direct_bits;
546 		unsigned direct_dis = dis - base;
547 
548 		if (dis_slot < end_dis_model)
549 			Re_encode_tree_reversed(&eb->renc, eb->bm_dis + (base - dis_slot),
550 			    direct_dis, direct_bits);
551 		else {
552 			Re_encode(&eb->renc, direct_dis >> dis_align_bits,
553 			    direct_bits - dis_align_bits);
554 			Re_encode_tree_reversed(&eb->renc, eb->bm_align, direct_dis, dis_align_bits);
555 		}
556 	}
557 }
558 
559 void	LZeb_full_flush(LZ_encoder_base *eb, State state);
560