xref: /plan9/sys/lib/dist/cmd/bflz.c (revision 0a75e54a195f699ff0426de1f8ba80514066a83b)
1 /*
2  * Extraordinarily brute force Lempel & Ziv-like
3  * compressor.  The input file must fit in memory
4  * during compression, and the output file will
5  * be reconstructed in memory during decompression.
6  * We search for large common sequences and use a
7  * greedy algorithm to choose which sequence gets
8  * compressed first.
9  *
10  * Files begin with "BLZ\n" and a 32-bit uncompressed file length.
11  *
12  * Output format is a series of blocks followed by
13  * a raw data section.  Each block begins with a 32-bit big-endian
14  * number.  The top bit is type and the next 31 bits
15  * are uncompressed size.  Type is one of
16  *	0 - use raw data for this length
17  *	1 - a 32-bit offset follows
18  * After the blocks come the raw data.  (The end of the blocks can be
19  * noted by summing block lengths until you reach the file length.)
20  */
21 
22 #include <u.h>
23 #include <libc.h>
24 #include <bio.h>
25 
26 #define malloc sbrk
27 
28 int minrun = 16;
29 int win = 16;
30 ulong outn;
31 int verbose;
32 int mindist;
33 
34 enum { Prime = 16777213 };	/* smallest prime < 2^24 (so p*256+256 < 2^32) */
35 enum { NOFF = 3 };
36 
37 Biobuf bout;
38 ulong length;
39 uchar *data;
40 ulong sum32(ulong, void*, long);
41 uchar *odat;
42 int nodat;
43 int nraw;
44 int rawstart;
45 int acct;
46 int zlength;
47 int maxchain;
48 int maxrle[256];
49 int nnew;
50 
51 typedef struct Node Node;
52 struct Node {
53 	Node *link;
54 	ulong key;
55 	ulong offset[NOFF];
56 };
57 
58 Node *nodepool;
59 int nnodepool;
60 
61 Node **hash;
62 uint nhash;
63 
64 uint maxlen;
65 uint maxsame;
66 uint replacesame = 8*1024*1024;
67 
68 Node *freelist, **freeend;
69 uint nalloc;
70 
71 Node*
allocnode(void)72 allocnode(void)
73 {
74 	int i;
75 	Node *n;
76 
77 	if(nnodepool == 0){
78 		nnodepool = 256*1024;
79 		nodepool = malloc(sizeof(Node)*nnodepool);
80 	}
81 	if(freelist){
82 		n = freelist;
83 		freelist = n->link;
84 		return n;
85 	}
86 	assert(nnodepool > 0);
87 	nalloc++;
88 	n = &nodepool[--nnodepool];
89 	for(i=0; i<NOFF; i++)
90 		n->offset[i] = -1;
91 
92 	return n;
93 }
94 
95 void
freenode(Node * n)96 freenode(Node *n)
97 {
98 	if(freelist == nil)
99 		freelist = n;
100 	else
101 		*freeend = n;
102 	freeend = &n->link;
103 	n->link = nil;
104 }
105 
106 Node**
llookup(ulong key)107 llookup(ulong key)
108 {
109 	uint c;
110 	Node **l, **top, *n;
111 
112 	if(nhash == 0){
113 		uint x;
114 
115 		x = length/8;
116 		for(nhash=1; nhash<x; nhash<<=1)
117 			;
118 		hash = sbrk(sizeof(Node*)*nhash);
119 	}
120 
121 	top = &hash[key&(nhash-1)];
122 	c = 0;
123 	for(l=top; *l; l=&(*l)->link){
124 		c++;
125 		if((*l)->key == key){
126 			/* move to front */
127 			n = *l;
128 			*l = n->link;
129 			n->link = *top;
130 			*top = n;
131 			return top;
132 		}
133 	}
134 	if(c > maxlen)
135 		maxlen = c;
136 	return l;
137 }
138 
139 Node*
lookup(ulong key)140 lookup(ulong key)
141 {
142 	return *llookup(key);
143 }
144 
145 void
insertnode(ulong key,ulong offset)146 insertnode(ulong key, ulong offset)
147 {
148 	int i;
149 	Node *n, **l;
150 
151 	l = llookup(key);
152 	if(*l == nil){
153 		if(l==&hash[key&(nhash-1)])
154 			nnew++;
155 		*l = allocnode();
156 		(*l)->key = key;
157 	}
158 	n = *l;
159 
160 	/* add or replace last */
161 	for(i=0; i<NOFF-1 && n->offset[i]!=-1; i++)
162 		;
163 	n->offset[i] = offset;
164 }
165 
166 void
Bputint(Biobufhdr * b,int n)167 Bputint(Biobufhdr *b, int n)
168 {
169 	uchar p[4];
170 
171 	p[0] = n>>24;
172 	p[1] = n>>16;
173 	p[2] = n>>8;
174 	p[3] = n;
175 	Bwrite(b, p, 4);
176 }
177 
178 void
flushraw(void)179 flushraw(void)
180 {
181 	if(nraw){
182 		if(verbose)
183 			fprint(2, "Raw %d+%d\n", rawstart, nraw);
184 		zlength += 4+nraw;
185 		Bputint(&bout, (1<<31)|nraw);
186 		memmove(odat+nodat, data+rawstart, nraw);
187 		nodat += nraw;
188 		nraw = 0;
189 	}
190 }
191 
192 int
rawbyte(int i)193 rawbyte(int i)
194 {
195 	assert(acct == i);
196 	if(nraw == 0)
197 		rawstart = i;
198 	acct++;
199 	nraw++;
200 	return 1;
201 }
202 
203 int
refblock(int i,int len,int off)204 refblock(int i, int len, int off)
205 {
206 	assert(acct == i);
207 	acct += len;
208 	if(nraw)
209 		flushraw();
210 	if(verbose)
211 		fprint(2, "Copy %d+%d from %d\n", i, len, off);
212 	Bputint(&bout, len);
213 	Bputint(&bout, off);
214 	zlength += 4+4;
215 	return len;
216 }
217 
218 int
cmprun(uchar * a,uchar * b,int len)219 cmprun(uchar *a, uchar *b, int len)
220 {
221 	int i;
222 
223 	if(a==b)
224 		return 0;
225 	for(i=0; i<len && a[i]==b[i]; i++)
226 		;
227 	return i;
228 }
229 
230 int
countrle(uchar * a)231 countrle(uchar *a)
232 {
233 	uchar a0;
234 	uchar *p;
235 
236 	p = a;
237 	a0 = *p;
238 	while(*++p == a0)
239 		;
240 	return p - a;
241 }
242 
243 void
compress(void)244 compress(void)
245 {
246 	int best, i, j, o, rle, run, maxrun, maxoff;
247 	ulong sum;
248 	Node *n;
249 
250 	sum = 0;
251 	for(i=0; i<win && i<length; i++)
252 		sum = (sum*256+data[i])%Prime;
253 	for(i=0; i<length-win; ){
254 		maxrun = 0;
255 		maxoff = 0;
256 		if(verbose)
257 			fprint(2, "look %.6lux\n", sum);
258 		n = lookup(sum);
259 		if(n){
260 			best = -1;
261 			for(o=0; o<NOFF; o++){
262 				if(n->offset[o] == -1)
263 					break;
264 				run = cmprun(data+i, data+n->offset[o], length-i);
265 				if(run > maxrun && n->offset[o]+mindist < i){
266 					maxrun = run;
267 					maxoff = n->offset[o];
268 					best = o;
269 				}
270 			}
271 			if(best > 0){
272 				o = n->offset[best];
273 				for(j=best; j>0; j--)
274 					n->offset[j] = n->offset[j-1];
275 				n->offset[0] = o;
276 			}
277 		}
278 
279 		if(maxrun >= minrun)
280 			j = i+refblock(i, maxrun, maxoff);
281 		else
282 			j = i+rawbyte(i);
283 		for(; i<j; i++){
284 			/* avoid huge chains from large runs of same byte */
285 			rle = countrle(data+i);
286 			if(rle<4)
287 				insertnode(sum, i);
288 			else if(rle>maxrle[data[i]]){
289 				maxrle[data[i]] = rle;
290 				insertnode(sum, i);
291 			}
292 			sum = (sum*256+data[i+win]) % Prime;
293 			sum = (sum + data[i]*outn) % Prime;
294 		}
295 	}
296 	/* could do better here */
297 	for(; i<length; i++)
298 		rawbyte(i);
299 	flushraw();
300 }
301 
302 void
usage(void)303 usage(void)
304 {
305 	fprint(2, "usage: bflz [-n winsize] [file]\n");
306 	exits("usage");
307 }
308 
309 void
main(int argc,char ** argv)310 main(int argc, char **argv)
311 {
312 	int fd, i, n;
313 	char buf[10485760];
314 
315 	ARGBEGIN{
316 	case 'd':
317 		verbose = 1;
318 		break;
319 	case 's':
320 		replacesame = atoi(EARGF(usage()));
321 		break;
322 	case 'm':
323 		mindist = atoi(EARGF(usage()));
324 		break;
325 	case 'n':
326 		win = atoi(EARGF(usage()));
327 		minrun = win;
328 		break;
329 	default:
330 		usage();
331 	}ARGEND
332 
333 	switch(argc){
334 	default:
335 		usage();
336 	case 0:
337 		fd = 0;
338 		break;
339 	case 1:
340 		if((fd = open(argv[0], OREAD)) < 0)
341 			sysfatal("open %s: %r", argv[0]);
342 		break;
343 	}
344 
345 	while((n = readn(fd, buf, sizeof buf)) > 0){
346 		data = realloc(data, length+n);
347 		if(data == nil)
348 			sysfatal("realloc: %r");
349 		memmove(data+length, buf, n);
350 		length += n;
351 		if(n < sizeof buf)
352 			break;
353 	}
354 	odat = malloc(length);
355 	if(odat == nil)
356 		sysfatal("malloc: %r");
357 
358 	Binit(&bout, 1, OWRITE);
359 	Bprint(&bout, "BLZ\n");
360 	Bputint(&bout, length);
361 	outn = 1;
362 	for(i=0; i<win; i++)
363 		outn = (outn * 256) % Prime;
364 
365 	if(verbose)
366 		fprint(2, "256^%d = %.6lux\n", win, outn);
367 	outn = Prime - outn;
368 	if(verbose)
369 		fprint(2, "outn = %.6lux\n", outn);
370 
371 	compress();
372 	Bwrite(&bout, odat, nodat);
373 	Bterm(&bout);
374 	fprint(2, "brk %p\n", sbrk(1));
375 	fprint(2, "%d nodes used; %d of %d hash slots used\n", nalloc, nnew, nhash);
376 	exits(nil);
377 }
378