xref: /plan9/sys/lib/dist/cmd/bflz.c (revision ff8c3af2f44d95267f67219afa20ba82ff6cf7e4)
1 /*
2  * Extraordinarily brute force Lempel & Ziv-like
3  * compressor.  The input file must fit in memory
4  * during compression, and the output file will
5  * be reconstructed in memory during decompression.
6  * We search for large common sequences and use a
7  * greedy algorithm to choose which sequence gets
8  * compressed first.
9  *
10  * Files begin with "BLZ\n" and a 32-bit uncompressed file length.
11  *
12  * Output format is a series of blocks followed by
13  * a raw data section.  Each block begins with a 32-bit big-endian
14  * number.  The top bit is type and the next 31 bits
15  * are uncompressed size.  Type is one of
16  *	0 - use raw data for this length
17  *	1 - a 32-bit offset follows
18  * After the blocks come the raw data.  (The end of the blocks can be
19  * noted by summing block lengths until you reach the file length.)
20  */
21 
22 #include <u.h>
23 #include <libc.h>
24 #include <bio.h>
25 
26 #define malloc sbrk
27 
28 int minrun = 16;
29 int win = 16;
30 ulong outn;
31 int verbose;
32 int mindist;
33 
34 enum { Prime = 16777213 };	/* smallest prime < 2^24 (so p*256+256 < 2^32) */
35 enum { NOFF = 3 };
36 
37 Biobuf bout;
38 ulong length;
39 uchar *data;
40 ulong sum32(ulong, void*, long);
41 uchar *odat;
42 int nodat;
43 int nraw;
44 int rawstart;
45 int acct;
46 int zlength;
47 int maxchain;
48 int maxrle[256];
49 int nnew;
50 
51 typedef struct Node Node;
52 struct Node {
53 	Node *link;
54 	ulong key;
55 	ulong offset[NOFF];
56 };
57 
58 Node *nodepool;
59 int nnodepool;
60 
61 Node **hash;
62 uint nhash;
63 
64 uint maxlen;
65 uint maxsame;
66 uint replacesame = 8*1024*1024;
67 
68 Node *freelist, **freeend;
69 uint nalloc;
70 
71 Node*
72 allocnode(void)
73 {
74 	int i;
75 	Node *n;
76 
77 	if(nnodepool == 0){
78 		nnodepool = 256*1024;
79 		nodepool = malloc(sizeof(Node)*nnodepool);
80 	}
81 	if(freelist){
82 		n = freelist;
83 		freelist = n->link;
84 		return n;
85 	}
86 	assert(nnodepool > 0);
87 	nalloc++;
88 	n = &nodepool[--nnodepool];
89 	for(i=0; i<NOFF; i++)
90 		n->offset[i] = -1;
91 
92 	return n;
93 }
94 
95 void
96 freenode(Node *n)
97 {
98 	if(freelist == nil)
99 		freelist = n;
100 	else
101 		*freeend = n;
102 	freeend = &n->link;
103 	n->link = nil;
104 }
105 
106 Node**
107 llookup(ulong key)
108 {
109 	uint c;
110 	Node **l, **top, *n;
111 
112 	if(nhash == 0){
113 		uint x;
114 
115 		x = length/8;
116 		for(nhash=1; nhash<x; nhash<<=1)
117 			;
118 		hash = sbrk(sizeof(Node*)*nhash);
119 	}
120 
121 	top = &hash[key&(nhash-1)];
122 	c = 0;
123 	for(l=top; *l; l=&(*l)->link){
124 		c++;
125 		if((*l)->key == key){
126 			/* move to front */
127 			n = *l;
128 			*l = n->link;
129 			n->link = *top;
130 			*top = n;
131 			return top;
132 		}
133 	}
134 	if(c > maxlen)
135 		maxlen = c;
136 	return l;
137 }
138 
139 Node*
140 lookup(ulong key)
141 {
142 	return *llookup(key);
143 }
144 
145 void
146 insertnode(ulong key, ulong offset)
147 {
148 	int i;
149 	Node *n, **l;
150 
151 	l = llookup(key);
152 	if(*l == nil){
153 		if(l==&hash[key&(nhash-1)])
154 			nnew++;
155 		*l = allocnode();
156 		(*l)->key = key;
157 	}
158 	n = *l;
159 
160 	/* add or replace last */
161 	for(i=0; i<NOFF-1 && n->offset[i]!=-1; i++)
162 		;
163 	n->offset[i] = offset;
164 }
165 
166 void
167 Bputint(Biobufhdr *b, int n)
168 {
169 	uchar p[4];
170 
171 	p[0] = n>>24;
172 	p[1] = n>>16;
173 	p[2] = n>>8;
174 	p[3] = n;
175 	Bwrite(b, p, 4);
176 }
177 
178 void
179 flushraw(void)
180 {
181 	if(nraw){
182 		if(verbose)
183 			fprint(2, "Raw %d+%d\n", rawstart, nraw);
184 		zlength += 4+nraw;
185 		Bputint(&bout, (1<<31)|nraw);
186 		memmove(odat+nodat, data+rawstart, nraw);
187 		nodat += nraw;
188 		nraw = 0;
189 	}
190 }
191 
192 int
193 rawbyte(int i)
194 {
195 	assert(acct == i);
196 	if(nraw == 0)
197 		rawstart = i;
198 	acct++;
199 	nraw++;
200 	return 1;
201 }
202 
203 int
204 refblock(int i, int len, int off)
205 {
206 	assert(acct == i);
207 	acct += len;
208 	if(nraw)
209 		flushraw();
210 	if(verbose)
211 		fprint(2, "Copy %d+%d from %d\n", i, len, off);
212 	Bputint(&bout, len);
213 	Bputint(&bout, off);
214 	zlength += 4+4;
215 	return len;
216 }
217 
218 int
219 cmprun(uchar *a, uchar *b, int len)
220 {
221 	int i;
222 
223 	if(a==b)
224 		return 0;
225 	for(i=0; i<len && a[i]==b[i]; i++)
226 		;
227 	return i;
228 }
229 
230 int
231 countrle(uchar *a)
232 {
233 	int i;
234 
235 	for(i=0; a[i]==a[0]; i++)
236 		;
237 	return i;
238 }
239 
240 void
241 compress(void)
242 {
243 	int best, i, j, o, rle, run, maxrun, maxoff;
244 	ulong sum;
245 	Node *n;
246 
247 	sum = 0;
248 	for(i=0; i<win && i<length; i++)
249 		sum = (sum*256+data[i])%Prime;
250 	for(i=0; i<length-win; ){
251 		maxrun = 0;
252 		maxoff = 0;
253 		if(verbose)
254 			fprint(2, "look %.6lux\n", sum);
255 		n = lookup(sum);
256 		if(n){
257 			best = -1;
258 			for(o=0; o<NOFF; o++){
259 				if(n->offset[o] == -1)
260 					break;
261 				run = cmprun(data+i, data+n->offset[o], length-i);
262 				if(run > maxrun && n->offset[o]+mindist < i){
263 					maxrun = run;
264 					maxoff = n->offset[o];
265 					best = o;
266 				}
267 			}
268 			if(best > 0){
269 				o = n->offset[best];
270 				for(j=best; j>0; j--)
271 					n->offset[j] = n->offset[j-1];
272 				n->offset[0] = o;
273 			}
274 		}
275 
276 		if(maxrun >= minrun)
277 			j = i+refblock(i, maxrun, maxoff);
278 		else
279 			j = i+rawbyte(i);
280 		for(; i<j; i++){
281 			/* avoid huge chains from large runs of same byte */
282 			rle = countrle(data+i);
283 			if(rle<4)
284 				insertnode(sum, i);
285 			else if(rle>maxrle[data[i]]){
286 				maxrle[data[i]] = rle;
287 				insertnode(sum, i);
288 			}
289 			sum = (sum*256+data[i+win]) % Prime;
290 			sum = (sum + data[i]*outn) % Prime;
291 		}
292 	}
293 	/* could do better here */
294 	for(; i<length; i++)
295 		rawbyte(i);
296 	flushraw();
297 }
298 
299 void
300 usage(void)
301 {
302 	fprint(2, "usage: bflz [-n winsize] [file]\n");
303 	exits("usage");
304 }
305 
306 void
307 main(int argc, char **argv)
308 {
309 	int fd, i, n;
310 	char buf[10485760];
311 
312 	ARGBEGIN{
313 	case 'd':
314 		verbose = 1;
315 		break;
316 	case 's':
317 		replacesame = atoi(EARGF(usage()));
318 		break;
319 	case 'm':
320 		mindist = atoi(EARGF(usage()));
321 		break;
322 	case 'n':
323 		win = atoi(EARGF(usage()));
324 		minrun = win;
325 		break;
326 	default:
327 		usage();
328 	}ARGEND
329 
330 	switch(argc){
331 	default:
332 		usage();
333 	case 0:
334 		fd = 0;
335 		break;
336 	case 1:
337 		if((fd = open(argv[0], OREAD)) < 0)
338 			sysfatal("open %s: %r", argv[0]);
339 		break;
340 	}
341 
342 	while((n = readn(fd, buf, sizeof buf)) > 0){
343 		data = realloc(data, length+n);
344 		if(data == nil)
345 			sysfatal("realloc: %r");
346 		memmove(data+length, buf, n);
347 		length += n;
348 		if(n < sizeof buf)
349 			break;
350 	}
351 	odat = malloc(length);
352 	if(odat == nil)
353 		sysfatal("malloc: %r");
354 
355 	Binit(&bout, 1, OWRITE);
356 	Bprint(&bout, "BLZ\n");
357 	Bputint(&bout, length);
358 	outn = 1;
359 	for(i=0; i<win; i++)
360 		outn = (outn * 256) % Prime;
361 
362 	if(verbose)
363 		fprint(2, "256^%d = %.6lux\n", win, outn);
364 	outn = Prime - outn;
365 	if(verbose)
366 		fprint(2, "outn = %.6lux\n", outn);
367 
368 	compress();
369 	Bwrite(&bout, odat, nodat);
370 	Bterm(&bout);
371 	fprint(2, "brk %p\n", sbrk(1));
372 	fprint(2, "%d nodes used; %d of %d hash slots used\n", nalloc, nnew, nhash);
373 	exits(nil);
374 }
375