1 /* Clzip - LZMA lossless data compressor
2 Copyright (C) 2010-2017 Antonio Diaz Diaz.
3
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18 #include "lzip.h"
19
20 static void
Dis_slots_init(void)21 Dis_slots_init(void)
22 {
23 int i, size, slot;
24 for (slot = 0; slot < 4; ++slot)
25 dis_slots[slot] = slot;
26 for (i = 4, size = 2, slot = 4; slot < 20; slot += 2) {
27 memset(&dis_slots[i], slot, size);
28 memset(&dis_slots[i+size], slot + 1, size);
29 size <<= 1;
30 i += size;
31 }
32 }
33
34 static uchar
get_slot(unsigned dis)35 get_slot(unsigned dis)
36 {
37 if (dis < (1 << 10))
38 return dis_slots[dis];
39 if (dis < (1 << 19))
40 return dis_slots[dis>> 9] + 18;
41 if (dis < (1 << 28))
42 return dis_slots[dis>>18] + 36;
43 return dis_slots[dis>>27] + 54;
44 }
45
46 static void
Prob_prices_init(void)47 Prob_prices_init(void)
48 {
49 int i, j;
50 for (i = 0; i < bit_model_total >> price_step_bits; ++i) {
51 unsigned val = (i * price_step) + (price_step / 2);
52 int bits = 0; /* base 2 logarithm of val */
53
54 for (j = 0; j < price_shift_bits; ++j) {
55 val = val * val;
56 bits <<= 1;
57 while (val >= (1 << 16)) {
58 val >>= 1;
59 ++bits;
60 }
61 }
62 bits += 15; /* remaining bits in val */
63 prob_prices[i] = (bit_model_total_bits << price_shift_bits) - bits;
64 }
65 }
66
67 static int
price_symbol3(Bit_model bm[],int symbol)68 price_symbol3(Bit_model bm[], int symbol)
69 {
70 int price;
71 bool bit = symbol & 1;
72
73 symbol |= 8;
74 symbol >>= 1;
75 price = price_bit(bm[symbol], bit);
76 bit = symbol & 1;
77 symbol >>= 1;
78 price += price_bit(bm[symbol], bit);
79 return price + price_bit(bm[1], symbol & 1);
80 }
81
82 static int
price_symbol6(Bit_model bm[],unsigned symbol)83 price_symbol6(Bit_model bm[], unsigned symbol)
84 {
85 int price;
86 bool bit = symbol & 1;
87
88 symbol |= 64;
89 symbol >>= 1;
90 price = price_bit(bm[symbol], bit);
91 bit = symbol & 1;
92 symbol >>= 1;
93 price += price_bit(bm[symbol], bit);
94 bit = symbol & 1;
95 symbol >>= 1;
96 price += price_bit(bm[symbol], bit);
97 bit = symbol & 1;
98 symbol >>= 1;
99 price += price_bit(bm[symbol], bit);
100 bit = symbol & 1;
101 symbol >>= 1;
102 price += price_bit(bm[symbol], bit);
103 return price + price_bit(bm[1], symbol & 1);
104 }
105
106 static int
price_symbol8(Bit_model bm[],int symbol)107 price_symbol8(Bit_model bm[], int symbol)
108 {
109 int price;
110 bool bit = symbol & 1;
111 symbol |= 0x100;
112 symbol >>= 1;
113 price = price_bit(bm[symbol], bit);
114 bit = symbol & 1;
115 symbol >>= 1;
116 price += price_bit(bm[symbol], bit);
117 bit = symbol & 1;
118 symbol >>= 1;
119 price += price_bit(bm[symbol], bit);
120 bit = symbol & 1;
121 symbol >>= 1;
122 price += price_bit(bm[symbol], bit);
123 bit = symbol & 1;
124 symbol >>= 1;
125 price += price_bit(bm[symbol], bit);
126 bit = symbol & 1;
127 symbol >>= 1;
128 price += price_bit(bm[symbol], bit);
129 bit = symbol & 1;
130 symbol >>= 1;
131 price += price_bit(bm[symbol], bit);
132 return price + price_bit(bm[1], symbol & 1);
133 }
134
135 static int
price_symbol_reversed(Bit_model bm[],int symbol,int num_bits)136 price_symbol_reversed(Bit_model bm[], int symbol, int num_bits)
137 {
138 int price = 0;
139 int model = 1;
140 int i;
141
142 for (i = num_bits; i > 0; --i) {
143 bool bit = symbol & 1;
144 symbol >>= 1;
145 price += price_bit(bm[model], bit);
146 model = (model << 1) | bit;
147 }
148 return price;
149 }
150
151 static int
price_matched(Bit_model bm[],unsigned symbol,unsigned match_byte)152 price_matched(Bit_model bm[], unsigned symbol, unsigned match_byte)
153 {
154 int price = 0;
155 unsigned mask = 0x100;
156
157 symbol |= mask;
158 for (;;) {
159 unsigned match_bit = (match_byte <<= 1) & mask;
160 bool bit = (symbol <<= 1) & 0x100;
161
162 price += price_bit(bm[(symbol>>9) + match_bit + mask], bit);
163 if (symbol >= 0x10000)
164 return price;
165 mask &= ~(match_bit ^ symbol);
166 /* if(match_bit != bit) mask = 0; */
167 }
168 }
169
170 struct Matchfinder_base {
171 uvlong partial_data_pos;
172 uchar * buffer; /* input buffer */
173 int32_t * prev_positions; /* 1 + last seen position of key. else 0 */
174 int32_t * pos_array; /* may be tree or chain */
175 int before_size; /* bytes to keep in buffer before dictionary */
176 int buffer_size;
177 int dict_size; /* bytes to keep in buffer before pos */
178 int pos; /* current pos in buffer */
179 int cyclic_pos; /* cycles through [0, dict_size] */
180 int stream_pos; /* first byte not yet read from file */
181 int pos_limit; /* when reached, a new block must be read */
182 int key4_mask;
183 int num_prev_positions; /* size of prev_positions */
184 int pos_array_size;
185 int infd; /* input file descriptor */
186 bool at_stream_end; /* stream_pos shows real end of file */
187 };
188
189 bool Mb_read_block(Matchfinder_base *mb);
190 void Mb_normalize_pos(Matchfinder_base *mb);
191 bool Mb_init(Matchfinder_base *mb, int before, int dict_size, int after_size, int dict_factor, int num_prev_positions23, int pos_array_factor, int ifd);
192
193 static void
Mb_free(Matchfinder_base * mb)194 Mb_free(Matchfinder_base *mb)
195 {
196 free(mb->prev_positions);
197 free(mb->buffer);
198 }
199
200 static int
Mb_avail_bytes(Matchfinder_base * mb)201 Mb_avail_bytes(Matchfinder_base *mb)
202 {
203 return mb->stream_pos - mb->pos;
204 }
205
206 static uvlong
Mb_data_position(Matchfinder_base * mb)207 Mb_data_position(Matchfinder_base *mb)
208 {
209 return mb->partial_data_pos + mb->pos;
210 }
211
212 static bool
Mb_data_finished(Matchfinder_base * mb)213 Mb_data_finished(Matchfinder_base *mb)
214 {
215 return mb->at_stream_end && mb->pos >= mb->stream_pos;
216 }
217
218 static int
Mb_true_match_len(Matchfinder_base * mb,int index,int distance)219 Mb_true_match_len(Matchfinder_base *mb, int index, int distance)
220 {
221 uchar * data = mb->buffer + mb->pos;
222 int i = index;
223 int len_limit = min(Mb_avail_bytes(mb), max_match_len);
224 while (i < len_limit && data[i-distance] == data[i])
225 ++i;
226 return i;
227 }
228
229 static void
Mb_move_pos(Matchfinder_base * mb)230 Mb_move_pos(Matchfinder_base *mb)
231 {
232 if (++mb->cyclic_pos > mb->dict_size)
233 mb->cyclic_pos = 0;
234 if (++mb->pos >= mb->pos_limit)
235 Mb_normalize_pos(mb);
236 }
237
238 void Mb_reset(Matchfinder_base *mb);
239
240 enum { re_buffer_size = 65536 };
241
242 typedef struct LZ_encoder_base LZ_encoder_base;
243 typedef struct Matchfinder_base Matchfinder_base;
244 typedef struct Range_encoder Range_encoder;
245
246 struct Range_encoder {
247 uvlong low;
248 uvlong partial_member_pos;
249 uchar * buffer; /* output buffer */
250 int pos; /* current pos in buffer */
251 uint32_t range;
252 unsigned ff_count;
253 int outfd; /* output file descriptor */
254 uchar cache;
255 File_header header;
256 };
257
258 void Re_flush_data(Range_encoder *renc);
259
260 static void
Re_put_byte(Range_encoder * renc,uchar b)261 Re_put_byte(Range_encoder *renc, uchar b)
262 {
263 renc->buffer[renc->pos] = b;
264 if (++renc->pos >= re_buffer_size)
265 Re_flush_data(renc);
266 }
267
268 static void
Re_shift_low(Range_encoder * renc)269 Re_shift_low(Range_encoder *renc)
270 {
271 if (renc->low >> 24 != 0xFF) {
272 bool carry = (renc->low > 0xFFFFFFFFU);
273 Re_put_byte(renc, renc->cache + carry);
274 for (; renc->ff_count > 0; --renc->ff_count)
275 Re_put_byte(renc, 0xFF + carry);
276 renc->cache = renc->low >> 24;
277 } else
278 ++renc->ff_count;
279 renc->low = (renc->low & 0x00FFFFFFU) << 8;
280 }
281
282 static void
Re_reset(Range_encoder * renc)283 Re_reset(Range_encoder *renc)
284 {
285 int i;
286 renc->low = 0;
287 renc->partial_member_pos = 0;
288 renc->pos = 0;
289 renc->range = 0xFFFFFFFFU;
290 renc->ff_count = 0;
291 renc->cache = 0;
292 for (i = 0; i < Fh_size; ++i)
293 Re_put_byte(renc, renc->header[i]);
294 }
295
296 static bool
Re_init(Range_encoder * renc,unsigned dict_size,int ofd)297 Re_init(Range_encoder *renc, unsigned dict_size, int ofd)
298 {
299 renc->buffer = (uchar *)malloc(re_buffer_size);
300 if (!renc->buffer)
301 return false;
302 renc->outfd = ofd;
303 Fh_set_magic(renc->header);
304 Fh_set_dict_size(renc->header, dict_size);
305 Re_reset(renc);
306 return true;
307 }
308
309 static void
Re_free(Range_encoder * renc)310 Re_free(Range_encoder *renc)
311 {
312 free(renc->buffer);
313 }
314
315 static uvlong
Re_member_position(Range_encoder * renc)316 Re_member_position(Range_encoder *renc)
317 {
318 return renc->partial_member_pos + renc->pos + renc->ff_count;
319 }
320
321 static void
Re_flush(Range_encoder * renc)322 Re_flush(Range_encoder *renc)
323 {
324 int i;
325 for (i = 0; i < 5; ++i)
326 Re_shift_low(renc);
327 }
328
329 static void
Re_encode(Range_encoder * renc,int symbol,int num_bits)330 Re_encode(Range_encoder *renc, int symbol, int num_bits)
331 {
332 unsigned mask;
333 for (mask = 1 << (num_bits - 1); mask > 0; mask >>= 1) {
334 renc->range >>= 1;
335 if (symbol & mask)
336 renc->low += renc->range;
337 if (renc->range <= 0x00FFFFFFU) {
338 renc->range <<= 8;
339 Re_shift_low(renc);
340 }
341 }
342 }
343
344 static void
Re_encode_bit(Range_encoder * renc,Bit_model * probability,bool bit)345 Re_encode_bit(Range_encoder *renc, Bit_model *probability, bool bit)
346 {
347 Bit_model prob = *probability;
348 uint32_t bound = (renc->range >> bit_model_total_bits) * prob;
349
350 if (!bit) {
351 renc->range = bound;
352 *probability += (bit_model_total - prob) >> bit_model_move_bits;
353 } else {
354 renc->low += bound;
355 renc->range -= bound;
356 *probability -= prob >> bit_model_move_bits;
357 }
358 if (renc->range <= 0x00FFFFFFU) {
359 renc->range <<= 8;
360 Re_shift_low(renc);
361 }
362 }
363
364 static void
Re_encode_tree3(Range_encoder * renc,Bit_model bm[],int symbol)365 Re_encode_tree3(Range_encoder *renc, Bit_model bm[], int symbol)
366 {
367 int model = 1;
368 bool bit = (symbol >> 2) & 1;
369
370 Re_encode_bit(renc, &bm[model], bit);
371 model = (model << 1) | bit;
372 bit = (symbol >> 1) & 1;
373 Re_encode_bit(renc, &bm[model], bit);
374 model = (model << 1) | bit;
375 Re_encode_bit(renc, &bm[model], symbol & 1);
376 }
377
378 static void
Re_encode_tree6(Range_encoder * renc,Bit_model bm[],unsigned symbol)379 Re_encode_tree6(Range_encoder *renc, Bit_model bm[], unsigned symbol)
380 {
381 int model = 1;
382 bool bit = (symbol >> 5) & 1;
383 Re_encode_bit(renc, &bm[model], bit);
384 model = (model << 1) | bit;
385 bit = (symbol >> 4) & 1;
386 Re_encode_bit(renc, &bm[model], bit);
387 model = (model << 1) | bit;
388 bit = (symbol >> 3) & 1;
389 Re_encode_bit(renc, &bm[model], bit);
390 model = (model << 1) | bit;
391 bit = (symbol >> 2) & 1;
392 Re_encode_bit(renc, &bm[model], bit);
393 model = (model << 1) | bit;
394 bit = (symbol >> 1) & 1;
395 Re_encode_bit(renc, &bm[model], bit);
396 model = (model << 1) | bit;
397 Re_encode_bit(renc, &bm[model], symbol & 1);
398 }
399
400 static void
Re_encode_tree8(Range_encoder * renc,Bit_model bm[],int symbol)401 Re_encode_tree8(Range_encoder *renc, Bit_model bm[], int symbol)
402 {
403 int model = 1;
404 int i;
405 for (i = 7; i >= 0; --i) {
406 bool bit = (symbol >> i) & 1;
407 Re_encode_bit(renc, &bm[model], bit);
408 model = (model << 1) | bit;
409 }
410 }
411
412 static void
Re_encode_tree_reversed(Range_encoder * renc,Bit_model bm[],int symbol,int num_bits)413 Re_encode_tree_reversed(Range_encoder *renc, Bit_model bm[], int symbol, int num_bits)
414 {
415 int model = 1;
416 int i;
417 for (i = num_bits; i > 0; --i) {
418 bool bit = symbol & 1;
419 symbol >>= 1;
420 Re_encode_bit(renc, &bm[model], bit);
421 model = (model << 1) | bit;
422 }
423 }
424
425 static void
Re_encode_matched(Range_encoder * renc,Bit_model bm[],unsigned symbol,unsigned match_byte)426 Re_encode_matched(Range_encoder *renc, Bit_model bm[], unsigned symbol, unsigned match_byte)
427 {
428 unsigned mask = 0x100;
429 symbol |= mask;
430 while (true) {
431 unsigned match_bit = (match_byte <<= 1) & mask;
432 bool bit = (symbol <<= 1) & 0x100;
433 Re_encode_bit(renc, &bm[(symbol>>9)+match_bit+mask], bit);
434 if (symbol >= 0x10000)
435 break;
436 mask &= ~(match_bit ^ symbol);
437 /* if(match_bit != bit) mask = 0; */
438 }
439 }
440
441 static void
Re_encode_len(struct Range_encoder * renc,Len_model * lm,int symbol,int pos_state)442 Re_encode_len(struct Range_encoder *renc, Len_model *lm, int symbol, int pos_state)
443 {
444 bool bit = ((symbol -= min_match_len) >= len_low_syms);
445 Re_encode_bit(renc, &lm->choice1, bit);
446 if (!bit)
447 Re_encode_tree3(renc, lm->bm_low[pos_state], symbol);
448 else {
449 bit = ((symbol -= len_low_syms) >= len_mid_syms);
450 Re_encode_bit(renc, &lm->choice2, bit);
451 if (!bit)
452 Re_encode_tree3(renc, lm->bm_mid[pos_state], symbol);
453 else
454 Re_encode_tree8(renc, lm->bm_high, symbol - len_mid_syms);
455 }
456 }
457
458 enum {
459 max_marker_size = 16,
460 num_rep_distances = 4 /* must be 4 */
461 };
462
463 struct LZ_encoder_base {
464 struct Matchfinder_base mb;
465 uint32_t crc;
466
467 Bit_model bm_literal[1<<literal_context_bits][0x300];
468 Bit_model bm_match[states][pos_states];
469 Bit_model bm_rep[states];
470 Bit_model bm_rep0[states];
471 Bit_model bm_rep1[states];
472 Bit_model bm_rep2[states];
473 Bit_model bm_len[states][pos_states];
474 Bit_model bm_dis_slot[len_states][1<<dis_slot_bits];
475 Bit_model bm_dis[modeled_distances-end_dis_model+1];
476 Bit_model bm_align[dis_align_size];
477 struct Len_model match_len_model;
478 struct Len_model rep_len_model;
479 struct Range_encoder renc;
480 };
481
482 void LZeb_reset(LZ_encoder_base *eb);
483
484 static bool
LZeb_init(LZ_encoder_base * eb,int before,int dict_size,int after_size,int dict_factor,int num_prev_positions23,int pos_array_factor,int ifd,int outfd)485 LZeb_init(LZ_encoder_base *eb, int before, int dict_size, int after_size, int dict_factor, int num_prev_positions23, int pos_array_factor, int ifd, int outfd)
486 {
487 if (!Mb_init(&eb->mb, before, dict_size, after_size, dict_factor,
488 num_prev_positions23, pos_array_factor, ifd))
489 return false;
490 if (!Re_init(&eb->renc, eb->mb.dict_size, outfd))
491 return false;
492 LZeb_reset(eb);
493 return true;
494 }
495
496 static void
LZeb_free(LZ_encoder_base * eb)497 LZeb_free(LZ_encoder_base *eb)
498 {
499 Re_free(&eb->renc);
500 Mb_free(&eb->mb);
501 }
502
503 static unsigned
LZeb_crc(LZ_encoder_base * eb)504 LZeb_crc(LZ_encoder_base *eb)
505 {
506 return eb->crc ^ 0xFFFFFFFFU;
507 }
508
509 static int
LZeb_price_literal(LZ_encoder_base * eb,uchar prev_byte,uchar symbol)510 LZeb_price_literal(LZ_encoder_base *eb, uchar prev_byte, uchar symbol)
511 {
512 return price_symbol8(eb->bm_literal[get_lit_state(prev_byte)], symbol);
513 }
514
515 static int
LZeb_price_matched(LZ_encoder_base * eb,uchar prev_byte,uchar symbol,uchar match_byte)516 LZeb_price_matched(LZ_encoder_base *eb, uchar prev_byte, uchar symbol, uchar match_byte)
517 {
518 return price_matched(eb->bm_literal[get_lit_state(prev_byte)], symbol,
519 match_byte);
520 }
521
522 static void
LZeb_encode_literal(LZ_encoder_base * eb,uchar prev_byte,uchar symbol)523 LZeb_encode_literal(LZ_encoder_base *eb, uchar prev_byte, uchar symbol)
524 {
525 Re_encode_tree8(&eb->renc, eb->bm_literal[get_lit_state(prev_byte)],
526 symbol);
527 }
528
529 static void
LZeb_encode_matched(LZ_encoder_base * eb,uchar prev_byte,uchar symbol,uchar match_byte)530 LZeb_encode_matched(LZ_encoder_base *eb, uchar prev_byte, uchar symbol, uchar match_byte)
531 {
532 Re_encode_matched(&eb->renc, eb->bm_literal[get_lit_state(prev_byte)],
533 symbol, match_byte);
534 }
535
536 static void
LZeb_encode_pair(LZ_encoder_base * eb,unsigned dis,int len,int pos_state)537 LZeb_encode_pair(LZ_encoder_base *eb, unsigned dis, int len, int pos_state)
538 {
539 unsigned dis_slot = get_slot(dis);
540 Re_encode_len(&eb->renc, &eb->match_len_model, len, pos_state);
541 Re_encode_tree6(&eb->renc, eb->bm_dis_slot[get_len_state(len)], dis_slot);
542
543 if (dis_slot >= start_dis_model) {
544 int direct_bits = (dis_slot >> 1) - 1;
545 unsigned base = (2 | (dis_slot & 1)) << direct_bits;
546 unsigned direct_dis = dis - base;
547
548 if (dis_slot < end_dis_model)
549 Re_encode_tree_reversed(&eb->renc, eb->bm_dis + (base - dis_slot),
550 direct_dis, direct_bits);
551 else {
552 Re_encode(&eb->renc, direct_dis >> dis_align_bits,
553 direct_bits - dis_align_bits);
554 Re_encode_tree_reversed(&eb->renc, eb->bm_align, direct_dis, dis_align_bits);
555 }
556 }
557 }
558
559 void LZeb_full_flush(LZ_encoder_base *eb, State state);
560