xref: /plan9-contrib/sys/src/cmd/lzip/fast_encoder.c (revision 13d37d7716a3e781f408392d7869dff5927c6669)
1 /*  Clzip - LZMA lossless data compressor
2     Copyright (C) 2010-2017 Antonio Diaz Diaz.
3 
4     This program is free software: you can redistribute it and/or modify
5     it under the terms of the GNU General Public License as published by
6     the Free Software Foundation, either version 2 of the License, or
7     (at your option) any later version.
8 
9     This program is distributed in the hope that it will be useful,
10     but WITHOUT ANY WARRANTY; without even the implied warranty of
11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12     GNU General Public License for more details.
13 
14     You should have received a copy of the GNU General Public License
15     along with this program.  If not, see <http://www.gnu.org/licenses/>. */
16 
17 #include "lzip.h"
18 #include "encoder_base.h"
19 #include "fast_encoder.h"
20 
21 int
FLZe_longest_match_len(FLZ_encoder * fe,int * distance)22 FLZe_longest_match_len(FLZ_encoder *fe, int *distance)
23 {
24 	enum { len_limit = 16 	};
25 	uchar *data = Mb_ptr_to_current_pos(&fe->eb.mb);
26 	int32_t * ptr0 = fe->eb.mb.pos_array + fe->eb.mb.cyclic_pos;
27 	int pos1 = fe->eb.mb.pos + 1;
28 	int maxlen = 0, newpos1, count;
29 	int available = min(Mb_avail_bytes(&fe->eb.mb), max_match_len);
30 
31 	if (available < len_limit)
32 		return 0;
33 
34 	fe->key4 = ((fe->key4 << 4) ^ data[3]) & fe->eb.mb.key4_mask;
35 	newpos1 = fe->eb.mb.prev_positions[fe->key4];
36 	fe->eb.mb.prev_positions[fe->key4] = pos1;
37 
38 	for (count = 4; ;) {
39 		int32_t * newptr;
40 		int delta;
41 
42 		if (newpos1 <= 0 || --count < 0 ||
43 		    (delta = pos1 - newpos1) > fe->eb.mb.dict_size) {
44 			*ptr0 = 0;
45 			break;
46 		}
47 		newptr = fe->eb.mb.pos_array +
48 		    (fe->eb.mb.cyclic_pos - delta +
49 		    ((fe->eb.mb.cyclic_pos >= delta) ? 0 : fe->eb.mb.dict_size + 1));
50 
51 		if (data[maxlen-delta] == data[maxlen]) {
52 			int	len = 0;
53 			while (len < available && data[len-delta] == data[len])
54 				++len;
55 			if (maxlen < len) {
56 				maxlen = len;
57 				*distance = delta - 1;
58 				if (maxlen >= len_limit) {
59 					*ptr0 = *newptr;
60 					break;
61 				}
62 			}
63 		}
64 
65 		*ptr0 = newpos1;
66 		ptr0 = newptr;
67 		newpos1 = *ptr0;
68 	}
69 	return maxlen;
70 }
71 
72 bool
FLZe_encode_member(FLZ_encoder * fe,uvlong member_size)73 FLZe_encode_member(FLZ_encoder *fe, uvlong member_size)
74 {
75 	uvlong member_size_limit = member_size - Ft_size - max_marker_size;
76 	int rep = 0, i;
77 	int reps[num_rep_distances];
78 	State state = 0;
79 
80 	for (i = 0; i < num_rep_distances; ++i)
81 		reps[i] = 0;
82 
83 	if (Mb_data_position(&fe->eb.mb) != 0 ||
84 	    Re_member_position(&fe->eb.renc) != Fh_size)
85 		return false;			/* can be called only once */
86 
87 	if (!Mb_data_finished(&fe->eb.mb))	/* encode first byte */ {
88 		uchar prev_byte = 0;
89 		uchar cur_byte = Mb_peek(&fe->eb.mb, 0);
90 		Re_encode_bit(&fe->eb.renc, &fe->eb.bm_match[state][0], 0);
91 		LZeb_encode_literal(&fe->eb, prev_byte, cur_byte);
92 		CRC32_update_byte(&fe->eb.crc, cur_byte);
93 		FLZe_reset_key4(fe);
94 		FLZe_update_and_move(fe, 1);
95 	}
96 
97 	while (!Mb_data_finished(&fe->eb.mb) &&
98 	    Re_member_position(&fe->eb.renc) < member_size_limit) {
99 		int	match_distance;
100 		int main_len = FLZe_longest_match_len(fe, &match_distance);
101 		int pos_state = Mb_data_position(&fe->eb.mb) & pos_state_mask;
102 		int	len = 0;
103 
104 		for (i = 0; i < num_rep_distances; ++i) {
105 			int tlen = Mb_true_match_len(&fe->eb.mb, 0, reps[i] + 1);
106 			if (tlen > len) {
107 				len = tlen;
108 				rep = i;
109 			}
110 		}
111 		if (len > min_match_len && len + 3 > main_len) {
112 			CRC32_update_buf(&fe->eb.crc, Mb_ptr_to_current_pos(&fe->eb.mb), len);
113 			Re_encode_bit(&fe->eb.renc, &fe->eb.bm_match[state][pos_state], 1);
114 			Re_encode_bit(&fe->eb.renc, &fe->eb.bm_rep[state], 1);
115 			Re_encode_bit(&fe->eb.renc, &fe->eb.bm_rep0[state], rep != 0);
116 			if (rep == 0)
117 				Re_encode_bit(&fe->eb.renc, &fe->eb.bm_len[state][pos_state], 1);
118 			else {
119 				int	distance;
120 				Re_encode_bit(&fe->eb.renc, &fe->eb.bm_rep1[state], rep > 1);
121 				if (rep > 1)
122 					Re_encode_bit(&fe->eb.renc, &fe->eb.bm_rep2[state], rep > 2);
123 				distance = reps[rep];
124 				for (i = rep; i > 0; --i)
125 					reps[i] = reps[i-1];
126 				reps[0] = distance;
127 			}
128 			state = St_set_rep(state);
129 			Re_encode_len(&fe->eb.renc, &fe->eb.rep_len_model, len, pos_state);
130 			Mb_move_pos(&fe->eb.mb);
131 			FLZe_update_and_move(fe, len - 1);
132 			continue;
133 		}
134 
135 		if (main_len > min_match_len) {
136 			CRC32_update_buf(&fe->eb.crc, Mb_ptr_to_current_pos(&fe->eb.mb), main_len);
137 			Re_encode_bit(&fe->eb.renc, &fe->eb.bm_match[state][pos_state], 1);
138 			Re_encode_bit(&fe->eb.renc, &fe->eb.bm_rep[state], 0);
139 			state = St_set_match(state);
140 			for (i = num_rep_distances - 1; i > 0; --i)
141 				reps[i] = reps[i-1];
142 			reps[0] = match_distance;
143 			LZeb_encode_pair(&fe->eb, match_distance, main_len, pos_state);
144 			Mb_move_pos(&fe->eb.mb);
145 			FLZe_update_and_move(fe, main_len - 1);
146 			continue;
147 		}
148 
149 		{
150 			uchar prev_byte = Mb_peek(&fe->eb.mb, 1);
151 			uchar cur_byte = Mb_peek(&fe->eb.mb, 0);
152 			uchar match_byte = Mb_peek(&fe->eb.mb, reps[0] + 1);
153 			Mb_move_pos(&fe->eb.mb);
154 			CRC32_update_byte(&fe->eb.crc, cur_byte);
155 
156 			if (match_byte == cur_byte) {
157 				int short_rep_price = price1(fe->eb.bm_match[state][pos_state]) +
158 				    price1(fe->eb.bm_rep[state]) +
159 				    price0(fe->eb.bm_rep0[state]) +
160 				    price0(fe->eb.bm_len[state][pos_state]);
161 				int	price = price0(fe->eb.bm_match[state][pos_state]);
162 				if (St_is_char(state))
163 					price += LZeb_price_literal(&fe->eb, prev_byte, cur_byte);
164 				else
165 					price += LZeb_price_matched(&fe->eb, prev_byte, cur_byte, match_byte);
166 				if (short_rep_price < price) {
167 					Re_encode_bit(&fe->eb.renc, &fe->eb.bm_match[state][pos_state], 1);
168 					Re_encode_bit(&fe->eb.renc, &fe->eb.bm_rep[state], 1);
169 					Re_encode_bit(&fe->eb.renc, &fe->eb.bm_rep0[state], 0);
170 					Re_encode_bit(&fe->eb.renc, &fe->eb.bm_len[state][pos_state], 0);
171 					state = St_set_short_rep(state);
172 					continue;
173 				}
174 			}
175 
176 			/* literal byte */
177 			Re_encode_bit(&fe->eb.renc, &fe->eb.bm_match[state][pos_state], 0);
178 			if (St_is_char(state))
179 				LZeb_encode_literal(&fe->eb, prev_byte, cur_byte);
180 			else
181 				LZeb_encode_matched(&fe->eb, prev_byte, cur_byte, match_byte);
182 			state = St_set_char(state);
183 		}
184 	}
185 
186 	LZeb_full_flush(&fe->eb, state);
187 	return true;
188 }
189