xref: /plan9/sys/lib/dist/cmd/bflz.c (revision 9a747e4fd48b9f4522c70c07e8f882a15030f964)
1 /*
2  * Extraordinarily brute force Lempel & Ziv-like
3  * compressor.  The input file must fit in memory
4  * during compression, and the output file will
5  * be reconstructed in memory during decompression.
6  * We search for large common sequences and use a
7  * greedy algorithm to choose which sequence gets
8  * compressed first.
9  *
10  * Files begin with "BLZ\n" and a 32-bit uncompressed file length.
11  *
12  * Output format is a series of blocks followed by
13  * a raw data section.  Each block begins with a 32-bit big-endian
14  * number.  The top bit is type and the next 31 bits
15  * are uncompressed size.  Type is one of
16  *	0 - use raw data for this length
17  *	1 - a 32-bit offset follows
18  * After the blocks come the raw data.  (The end of the blocks can be
19  * noted by summing block lengths until you reach the file length.)
20  */
21 
22 #include <u.h>
23 #include <libc.h>
24 #include <bio.h>
25 
26 int minrun = 16;
27 int win = 16;
28 ulong outn;
29 int verbose;
30 int mindist;
31 
32 enum { Prime = 16777213 };	/* smallest prime < 2^24 (so p*256+256 < 2^32) */
33 
34 Biobuf bout;
35 ulong length;
36 uchar *data;
37 ulong sum32(ulong, void*, long);
38 uchar *odat;
39 int nodat;
40 int nraw;
41 int rawstart;
42 int acct;
43 int zlength;
44 int maxchain;
45 int maxrle[256];
46 
47 typedef struct Node Node;
48 struct Node {
49 	ulong key;
50 	int offset;
51 	Node *left;
52 	Node *right;
53 	Node *link;
54 };
55 
56 Node *nodepool;
57 int nnodepool;
58 
59 Node *root;
60 
61 Node*
62 allocnode(void)
63 {
64 	if(nodepool == nil){
65 		nnodepool = length+win+1;
66 		nodepool = mallocz(sizeof(Node)*nnodepool, 1);
67 	}
68 	assert(nnodepool > 0);
69 	return &nodepool[--nnodepool];
70 }
71 
72 Node**
73 llookup(ulong key, int walkdown)
74 {
75 	Node **l;
76 
77 	l = &root;
78 	while(*l){
79 		if(key < (*l)->key)
80 			l = &(*l)->left;
81 		else if(key > (*l)->key)
82 			l = &(*l)->right;
83 		else{
84 			if(walkdown){
85 				while(*l)
86 					l=&(*l)->link;
87 			}
88 			break;
89 		}
90 	}
91 	return l;
92 }
93 
94 
95 void
96 insertnode(ulong key, ulong offset)
97 {
98 	Node **l, *n;
99 
100 	l = llookup(key, 1);
101 	n = allocnode();
102 	n->key = key;
103 	n->offset = offset;
104 	*l = n;
105 }
106 
107 Node*
108 lookup(ulong key)
109 {
110 	return *llookup(key, 0);
111 }
112 
113 void
114 Bputint(Biobufhdr *b, int n)
115 {
116 	uchar p[4];
117 
118 	p[0] = n>>24;
119 	p[1] = n>>16;
120 	p[2] = n>>8;
121 	p[3] = n;
122 	Bwrite(b, p, 4);
123 }
124 
125 void
126 flushraw(void)
127 {
128 	if(nraw){
129 		if(verbose)
130 			fprint(2, "Raw %d+%d\n", rawstart, nraw);
131 		zlength += 4+nraw;
132 		Bputint(&bout, (1<<31)|nraw);
133 		memmove(odat+nodat, data+rawstart, nraw);
134 		nodat += nraw;
135 		nraw = 0;
136 	}
137 }
138 
139 int
140 rawbyte(int i)
141 {
142 	assert(acct == i);
143 	if(nraw == 0)
144 		rawstart = i;
145 	acct++;
146 	nraw++;
147 	return 1;
148 }
149 
150 int
151 refblock(int i, int len, int off)
152 {
153 	assert(acct == i);
154 	acct += len;
155 	if(nraw)
156 		flushraw();
157 	if(verbose)
158 		fprint(2, "Copy %d+%d from %d\n", i, len, off);
159 	Bputint(&bout, len);
160 	Bputint(&bout, off);
161 	zlength += 4+4;
162 	return len;
163 }
164 
165 int
166 cmprun(uchar *a, uchar *b, int len)
167 {
168 	int i;
169 
170 	if(a==b)
171 		return 0;
172 	for(i=0; i<len && a[i]==b[i]; i++)
173 		;
174 	return i;
175 }
176 
177 int
178 countrle(uchar *a)
179 {
180 	int i;
181 
182 	for(i=0; a[i]==a[0]; i++)
183 		;
184 	return i;
185 }
186 
187 void
188 compress(void)
189 {
190 	int i, j, rle, run, maxrun, maxoff;
191 	ulong sum;
192 	Node *n;
193 
194 	sum = 0;
195 	for(i=0; i<win && i<length; i++)
196 		sum = (sum*256+data[i])%Prime;
197 	for(i=0; i<length-win; ){
198 		n = lookup(sum);
199 		maxrun = 0;
200 		maxoff = 0;
201 		if(verbose)
202 			fprint(2, "look %.6lux\n", sum);
203 		j=0;
204 		for(; n; n=n->link, j++){
205 			run = cmprun(data+i, data+n->offset, length-i);
206 			if(verbose)
207 				fprint(2, "(%d,%d)%d...", i, n->offset, run);
208 			if(run > maxrun && n->offset+mindist < i){
209 				maxrun = run;
210 				if(verbose)
211 					fprint(2, "[%d] (min %d)\n", maxrun, minrun);
212 				maxoff = n->offset;
213 			}
214 		}
215 		if(j > maxchain){
216 			if(verbose)
217 				fprint(2, "%.6lux %d\n", sum, j);
218 			maxchain = j;
219 		}
220 		if(maxrun >= minrun)
221 			j = i+refblock(i, maxrun, maxoff);
222 		else
223 			j = i+rawbyte(i);
224 		for(; i<j; i++){
225 			/* avoid huge chains from large runs of same byte */
226 			rle = countrle(data+i);
227 			if(rle<4)
228 				insertnode(sum, i);
229 			else if(rle>maxrle[data[i]]){
230 				maxrle[data[i]] = rle;
231 				insertnode(sum, i);
232 			}
233 			sum = (sum*256+data[i+win]) % Prime;
234 			sum = (sum + data[i]*outn) % Prime;
235 		}
236 	}
237 	/* could do better here */
238 	for(; i<length; i++)
239 		rawbyte(i);
240 	flushraw();
241 }
242 
243 void
244 usage(void)
245 {
246 	fprint(2, "usage: xzip [-n winsize] [file]\n");
247 	exits("usage");
248 }
249 
250 void
251 main(int argc, char **argv)
252 {
253 	int fd, i, n;
254 	char buf[10485760];
255 
256 	ARGBEGIN{
257 	case 'd':
258 		verbose = 1;
259 		break;
260 	case 'm':
261 		mindist = atoi(EARGF(usage()));
262 		break;
263 	case 'n':
264 		win = atoi(EARGF(usage()));
265 		minrun = win;
266 		break;
267 	default:
268 		usage();
269 	}ARGEND
270 
271 	switch(argc){
272 	default:
273 		usage();
274 	case 0:
275 		fd = 0;
276 		break;
277 	case 1:
278 		if((fd = open(argv[0], OREAD)) < 0)
279 			sysfatal("open %s: %r", argv[0]);
280 		break;
281 	}
282 
283 	while((n = readn(fd, buf, sizeof buf)) > 0){
284 		data = realloc(data, length+n);
285 		if(data == nil)
286 			sysfatal("realloc: %r");
287 		memmove(data+length, buf, n);
288 		length += n;
289 		if(n < sizeof buf)
290 			break;
291 	}
292 	odat = malloc(length);
293 	if(odat == nil)
294 		sysfatal("malloc: %r");
295 
296 	Binit(&bout, 1, OWRITE);
297 	Bprint(&bout, "BLZ\n");
298 	Bputint(&bout, length);
299 	outn = 1;
300 	for(i=0; i<win; i++)
301 		outn = (outn * 256) % Prime;
302 
303 	if(verbose)
304 		fprint(2, "256^%d = %.6lux\n", win, outn);
305 	outn = Prime - outn;
306 	if(verbose)
307 		fprint(2, "outn = %.6lux\n", outn);
308 
309 	compress();
310 	Bwrite(&bout, odat, nodat);
311 	Bterm(&bout);
312 	exits(nil);
313 }
314