1 /*
2 * Extraordinarily brute force Lempel & Ziv-like
3 * compressor. The input file must fit in memory
4 * during compression, and the output file will
5 * be reconstructed in memory during decompression.
6 * We search for large common sequences and use a
7 * greedy algorithm to choose which sequence gets
8 * compressed first.
9 *
10 * Files begin with "BLZ\n" and a 32-bit uncompressed file length.
11 *
12 * Output format is a series of blocks followed by
13 * a raw data section. Each block begins with a 32-bit big-endian
14 * number. The top bit is type and the next 31 bits
15 * are uncompressed size. Type is one of
16 * 0 - use raw data for this length
17 * 1 - a 32-bit offset follows
18 * After the blocks come the raw data. (The end of the blocks can be
19 * noted by summing block lengths until you reach the file length.)
20 */
21
22 #include <u.h>
23 #include <libc.h>
24 #include <bio.h>
25
26 #define malloc sbrk
27
28 int minrun = 16;
29 int win = 16;
30 ulong outn;
31 int verbose;
32 int mindist;
33
34 enum { Prime = 16777213 }; /* smallest prime < 2^24 (so p*256+256 < 2^32) */
35 enum { NOFF = 3 };
36
37 Biobuf bout;
38 ulong length;
39 uchar *data;
40 ulong sum32(ulong, void*, long);
41 uchar *odat;
42 int nodat;
43 int nraw;
44 int rawstart;
45 int acct;
46 int zlength;
47 int maxchain;
48 int maxrle[256];
49 int nnew;
50
51 typedef struct Node Node;
52 struct Node {
53 Node *link;
54 ulong key;
55 ulong offset[NOFF];
56 };
57
58 Node *nodepool;
59 int nnodepool;
60
61 Node **hash;
62 uint nhash;
63
64 uint maxlen;
65 uint maxsame;
66 uint replacesame = 8*1024*1024;
67
68 Node *freelist, **freeend;
69 uint nalloc;
70
71 Node*
allocnode(void)72 allocnode(void)
73 {
74 int i;
75 Node *n;
76
77 if(nnodepool == 0){
78 nnodepool = 256*1024;
79 nodepool = malloc(sizeof(Node)*nnodepool);
80 }
81 if(freelist){
82 n = freelist;
83 freelist = n->link;
84 return n;
85 }
86 assert(nnodepool > 0);
87 nalloc++;
88 n = &nodepool[--nnodepool];
89 for(i=0; i<NOFF; i++)
90 n->offset[i] = -1;
91
92 return n;
93 }
94
95 void
freenode(Node * n)96 freenode(Node *n)
97 {
98 if(freelist == nil)
99 freelist = n;
100 else
101 *freeend = n;
102 freeend = &n->link;
103 n->link = nil;
104 }
105
106 Node**
llookup(ulong key)107 llookup(ulong key)
108 {
109 uint c;
110 Node **l, **top, *n;
111
112 if(nhash == 0){
113 uint x;
114
115 x = length/8;
116 for(nhash=1; nhash<x; nhash<<=1)
117 ;
118 hash = sbrk(sizeof(Node*)*nhash);
119 }
120
121 top = &hash[key&(nhash-1)];
122 c = 0;
123 for(l=top; *l; l=&(*l)->link){
124 c++;
125 if((*l)->key == key){
126 /* move to front */
127 n = *l;
128 *l = n->link;
129 n->link = *top;
130 *top = n;
131 return top;
132 }
133 }
134 if(c > maxlen)
135 maxlen = c;
136 return l;
137 }
138
139 Node*
lookup(ulong key)140 lookup(ulong key)
141 {
142 return *llookup(key);
143 }
144
145 void
insertnode(ulong key,ulong offset)146 insertnode(ulong key, ulong offset)
147 {
148 int i;
149 Node *n, **l;
150
151 l = llookup(key);
152 if(*l == nil){
153 if(l==&hash[key&(nhash-1)])
154 nnew++;
155 *l = allocnode();
156 (*l)->key = key;
157 }
158 n = *l;
159
160 /* add or replace last */
161 for(i=0; i<NOFF-1 && n->offset[i]!=-1; i++)
162 ;
163 n->offset[i] = offset;
164 }
165
166 void
Bputint(Biobufhdr * b,int n)167 Bputint(Biobufhdr *b, int n)
168 {
169 uchar p[4];
170
171 p[0] = n>>24;
172 p[1] = n>>16;
173 p[2] = n>>8;
174 p[3] = n;
175 Bwrite(b, p, 4);
176 }
177
178 void
flushraw(void)179 flushraw(void)
180 {
181 if(nraw){
182 if(verbose)
183 fprint(2, "Raw %d+%d\n", rawstart, nraw);
184 zlength += 4+nraw;
185 Bputint(&bout, (1<<31)|nraw);
186 memmove(odat+nodat, data+rawstart, nraw);
187 nodat += nraw;
188 nraw = 0;
189 }
190 }
191
192 int
rawbyte(int i)193 rawbyte(int i)
194 {
195 assert(acct == i);
196 if(nraw == 0)
197 rawstart = i;
198 acct++;
199 nraw++;
200 return 1;
201 }
202
203 int
refblock(int i,int len,int off)204 refblock(int i, int len, int off)
205 {
206 assert(acct == i);
207 acct += len;
208 if(nraw)
209 flushraw();
210 if(verbose)
211 fprint(2, "Copy %d+%d from %d\n", i, len, off);
212 Bputint(&bout, len);
213 Bputint(&bout, off);
214 zlength += 4+4;
215 return len;
216 }
217
218 int
cmprun(uchar * a,uchar * b,int len)219 cmprun(uchar *a, uchar *b, int len)
220 {
221 int i;
222
223 if(a==b)
224 return 0;
225 for(i=0; i<len && a[i]==b[i]; i++)
226 ;
227 return i;
228 }
229
230 int
countrle(uchar * a)231 countrle(uchar *a)
232 {
233 uchar a0;
234 uchar *p;
235
236 p = a;
237 a0 = *p;
238 while(*++p == a0)
239 ;
240 return p - a;
241 }
242
243 void
compress(void)244 compress(void)
245 {
246 int best, i, j, o, rle, run, maxrun, maxoff;
247 ulong sum;
248 Node *n;
249
250 sum = 0;
251 for(i=0; i<win && i<length; i++)
252 sum = (sum*256+data[i])%Prime;
253 for(i=0; i<length-win; ){
254 maxrun = 0;
255 maxoff = 0;
256 if(verbose)
257 fprint(2, "look %.6lux\n", sum);
258 n = lookup(sum);
259 if(n){
260 best = -1;
261 for(o=0; o<NOFF; o++){
262 if(n->offset[o] == -1)
263 break;
264 run = cmprun(data+i, data+n->offset[o], length-i);
265 if(run > maxrun && n->offset[o]+mindist < i){
266 maxrun = run;
267 maxoff = n->offset[o];
268 best = o;
269 }
270 }
271 if(best > 0){
272 o = n->offset[best];
273 for(j=best; j>0; j--)
274 n->offset[j] = n->offset[j-1];
275 n->offset[0] = o;
276 }
277 }
278
279 if(maxrun >= minrun)
280 j = i+refblock(i, maxrun, maxoff);
281 else
282 j = i+rawbyte(i);
283 for(; i<j; i++){
284 /* avoid huge chains from large runs of same byte */
285 rle = countrle(data+i);
286 if(rle<4)
287 insertnode(sum, i);
288 else if(rle>maxrle[data[i]]){
289 maxrle[data[i]] = rle;
290 insertnode(sum, i);
291 }
292 sum = (sum*256+data[i+win]) % Prime;
293 sum = (sum + data[i]*outn) % Prime;
294 }
295 }
296 /* could do better here */
297 for(; i<length; i++)
298 rawbyte(i);
299 flushraw();
300 }
301
302 void
usage(void)303 usage(void)
304 {
305 fprint(2, "usage: bflz [-n winsize] [file]\n");
306 exits("usage");
307 }
308
309 void
main(int argc,char ** argv)310 main(int argc, char **argv)
311 {
312 int fd, i, n;
313 char buf[10485760];
314
315 ARGBEGIN{
316 case 'd':
317 verbose = 1;
318 break;
319 case 's':
320 replacesame = atoi(EARGF(usage()));
321 break;
322 case 'm':
323 mindist = atoi(EARGF(usage()));
324 break;
325 case 'n':
326 win = atoi(EARGF(usage()));
327 minrun = win;
328 break;
329 default:
330 usage();
331 }ARGEND
332
333 switch(argc){
334 default:
335 usage();
336 case 0:
337 fd = 0;
338 break;
339 case 1:
340 if((fd = open(argv[0], OREAD)) < 0)
341 sysfatal("open %s: %r", argv[0]);
342 break;
343 }
344
345 while((n = readn(fd, buf, sizeof buf)) > 0){
346 data = realloc(data, length+n);
347 if(data == nil)
348 sysfatal("realloc: %r");
349 memmove(data+length, buf, n);
350 length += n;
351 if(n < sizeof buf)
352 break;
353 }
354 odat = malloc(length);
355 if(odat == nil)
356 sysfatal("malloc: %r");
357
358 Binit(&bout, 1, OWRITE);
359 Bprint(&bout, "BLZ\n");
360 Bputint(&bout, length);
361 outn = 1;
362 for(i=0; i<win; i++)
363 outn = (outn * 256) % Prime;
364
365 if(verbose)
366 fprint(2, "256^%d = %.6lux\n", win, outn);
367 outn = Prime - outn;
368 if(verbose)
369 fprint(2, "outn = %.6lux\n", outn);
370
371 compress();
372 Bwrite(&bout, odat, nodat);
373 Bterm(&bout);
374 fprint(2, "brk %p\n", sbrk(1));
375 fprint(2, "%d nodes used; %d of %d hash slots used\n", nalloc, nnew, nhash);
376 exits(nil);
377 }
378