1 /* 2 * Extraordinarily brute force Lempel & Ziv-like 3 * compressor. The input file must fit in memory 4 * during compression, and the output file will 5 * be reconstructed in memory during decompression. 6 * We search for large common sequences and use a 7 * greedy algorithm to choose which sequence gets 8 * compressed first. 9 * 10 * Files begin with "BLZ\n" and a 32-bit uncompressed file length. 11 * 12 * Output format is a series of blocks followed by 13 * a raw data section. Each block begins with a 32-bit big-endian 14 * number. The top bit is type and the next 31 bits 15 * are uncompressed size. Type is one of 16 * 0 - use raw data for this length 17 * 1 - a 32-bit offset follows 18 * After the blocks come the raw data. (The end of the blocks can be 19 * noted by summing block lengths until you reach the file length.) 20 */ 21 22 #include <u.h> 23 #include <libc.h> 24 #include <bio.h> 25 26 #define malloc sbrk 27 28 int minrun = 16; 29 int win = 16; 30 ulong outn; 31 int verbose; 32 int mindist; 33 34 enum { Prime = 16777213 }; /* smallest prime < 2^24 (so p*256+256 < 2^32) */ 35 enum { NOFF = 3 }; 36 37 Biobuf bout; 38 ulong length; 39 uchar *data; 40 ulong sum32(ulong, void*, long); 41 uchar *odat; 42 int nodat; 43 int nraw; 44 int rawstart; 45 int acct; 46 int zlength; 47 int maxchain; 48 int maxrle[256]; 49 int nnew; 50 51 typedef struct Node Node; 52 struct Node { 53 Node *link; 54 ulong key; 55 ulong offset[NOFF]; 56 }; 57 58 Node *nodepool; 59 int nnodepool; 60 61 Node **hash; 62 uint nhash; 63 64 uint maxlen; 65 uint maxsame; 66 uint replacesame = 8*1024*1024; 67 68 Node *freelist, **freeend; 69 uint nalloc; 70 71 Node* 72 allocnode(void) 73 { 74 int i; 75 Node *n; 76 77 if(nnodepool == 0){ 78 nnodepool = 256*1024; 79 nodepool = malloc(sizeof(Node)*nnodepool); 80 } 81 if(freelist){ 82 n = freelist; 83 freelist = n->link; 84 return n; 85 } 86 assert(nnodepool > 0); 87 nalloc++; 88 n = &nodepool[--nnodepool]; 89 for(i=0; i<NOFF; i++) 90 n->offset[i] = -1; 91 92 return n; 93 } 94 95 void 96 freenode(Node *n) 97 { 98 if(freelist == nil) 99 freelist = n; 100 else 101 *freeend = n; 102 freeend = &n->link; 103 n->link = nil; 104 } 105 106 Node** 107 llookup(ulong key) 108 { 109 uint c; 110 Node **l, **top, *n; 111 112 if(nhash == 0){ 113 uint x; 114 115 x = length/8; 116 for(nhash=1; nhash<x; nhash<<=1) 117 ; 118 hash = sbrk(sizeof(Node*)*nhash); 119 } 120 121 top = &hash[key&(nhash-1)]; 122 c = 0; 123 for(l=top; *l; l=&(*l)->link){ 124 c++; 125 if((*l)->key == key){ 126 /* move to front */ 127 n = *l; 128 *l = n->link; 129 n->link = *top; 130 *top = n; 131 return top; 132 } 133 } 134 if(c > maxlen) 135 maxlen = c; 136 return l; 137 } 138 139 Node* 140 lookup(ulong key) 141 { 142 return *llookup(key); 143 } 144 145 void 146 insertnode(ulong key, ulong offset) 147 { 148 int i; 149 Node *n, **l; 150 151 l = llookup(key); 152 if(*l == nil){ 153 if(l==&hash[key&(nhash-1)]) 154 nnew++; 155 *l = allocnode(); 156 (*l)->key = key; 157 } 158 n = *l; 159 160 /* add or replace last */ 161 for(i=0; i<NOFF-1 && n->offset[i]!=-1; i++) 162 ; 163 n->offset[i] = offset; 164 } 165 166 void 167 Bputint(Biobufhdr *b, int n) 168 { 169 uchar p[4]; 170 171 p[0] = n>>24; 172 p[1] = n>>16; 173 p[2] = n>>8; 174 p[3] = n; 175 Bwrite(b, p, 4); 176 } 177 178 void 179 flushraw(void) 180 { 181 if(nraw){ 182 if(verbose) 183 fprint(2, "Raw %d+%d\n", rawstart, nraw); 184 zlength += 4+nraw; 185 Bputint(&bout, (1<<31)|nraw); 186 memmove(odat+nodat, data+rawstart, nraw); 187 nodat += nraw; 188 nraw = 0; 189 } 190 } 191 192 int 193 rawbyte(int i) 194 { 195 assert(acct == i); 196 if(nraw == 0) 197 rawstart = i; 198 acct++; 199 nraw++; 200 return 1; 201 } 202 203 int 204 refblock(int i, int len, int off) 205 { 206 assert(acct == i); 207 acct += len; 208 if(nraw) 209 flushraw(); 210 if(verbose) 211 fprint(2, "Copy %d+%d from %d\n", i, len, off); 212 Bputint(&bout, len); 213 Bputint(&bout, off); 214 zlength += 4+4; 215 return len; 216 } 217 218 int 219 cmprun(uchar *a, uchar *b, int len) 220 { 221 int i; 222 223 if(a==b) 224 return 0; 225 for(i=0; i<len && a[i]==b[i]; i++) 226 ; 227 return i; 228 } 229 230 int 231 countrle(uchar *a) 232 { 233 uchar a0; 234 uchar *p; 235 236 p = a; 237 a0 = *p; 238 while(*++p == a0) 239 ; 240 return p - a; 241 } 242 243 void 244 compress(void) 245 { 246 int best, i, j, o, rle, run, maxrun, maxoff; 247 ulong sum; 248 Node *n; 249 250 sum = 0; 251 for(i=0; i<win && i<length; i++) 252 sum = (sum*256+data[i])%Prime; 253 for(i=0; i<length-win; ){ 254 maxrun = 0; 255 maxoff = 0; 256 if(verbose) 257 fprint(2, "look %.6lux\n", sum); 258 n = lookup(sum); 259 if(n){ 260 best = -1; 261 for(o=0; o<NOFF; o++){ 262 if(n->offset[o] == -1) 263 break; 264 run = cmprun(data+i, data+n->offset[o], length-i); 265 if(run > maxrun && n->offset[o]+mindist < i){ 266 maxrun = run; 267 maxoff = n->offset[o]; 268 best = o; 269 } 270 } 271 if(best > 0){ 272 o = n->offset[best]; 273 for(j=best; j>0; j--) 274 n->offset[j] = n->offset[j-1]; 275 n->offset[0] = o; 276 } 277 } 278 279 if(maxrun >= minrun) 280 j = i+refblock(i, maxrun, maxoff); 281 else 282 j = i+rawbyte(i); 283 for(; i<j; i++){ 284 /* avoid huge chains from large runs of same byte */ 285 rle = countrle(data+i); 286 if(rle<4) 287 insertnode(sum, i); 288 else if(rle>maxrle[data[i]]){ 289 maxrle[data[i]] = rle; 290 insertnode(sum, i); 291 } 292 sum = (sum*256+data[i+win]) % Prime; 293 sum = (sum + data[i]*outn) % Prime; 294 } 295 } 296 /* could do better here */ 297 for(; i<length; i++) 298 rawbyte(i); 299 flushraw(); 300 } 301 302 void 303 usage(void) 304 { 305 fprint(2, "usage: bflz [-n winsize] [file]\n"); 306 exits("usage"); 307 } 308 309 void 310 main(int argc, char **argv) 311 { 312 int fd, i, n; 313 char buf[10485760]; 314 315 ARGBEGIN{ 316 case 'd': 317 verbose = 1; 318 break; 319 case 's': 320 replacesame = atoi(EARGF(usage())); 321 break; 322 case 'm': 323 mindist = atoi(EARGF(usage())); 324 break; 325 case 'n': 326 win = atoi(EARGF(usage())); 327 minrun = win; 328 break; 329 default: 330 usage(); 331 }ARGEND 332 333 switch(argc){ 334 default: 335 usage(); 336 case 0: 337 fd = 0; 338 break; 339 case 1: 340 if((fd = open(argv[0], OREAD)) < 0) 341 sysfatal("open %s: %r", argv[0]); 342 break; 343 } 344 345 while((n = readn(fd, buf, sizeof buf)) > 0){ 346 data = realloc(data, length+n); 347 if(data == nil) 348 sysfatal("realloc: %r"); 349 memmove(data+length, buf, n); 350 length += n; 351 if(n < sizeof buf) 352 break; 353 } 354 odat = malloc(length); 355 if(odat == nil) 356 sysfatal("malloc: %r"); 357 358 Binit(&bout, 1, OWRITE); 359 Bprint(&bout, "BLZ\n"); 360 Bputint(&bout, length); 361 outn = 1; 362 for(i=0; i<win; i++) 363 outn = (outn * 256) % Prime; 364 365 if(verbose) 366 fprint(2, "256^%d = %.6lux\n", win, outn); 367 outn = Prime - outn; 368 if(verbose) 369 fprint(2, "outn = %.6lux\n", outn); 370 371 compress(); 372 Bwrite(&bout, odat, nodat); 373 Bterm(&bout); 374 fprint(2, "brk %p\n", sbrk(1)); 375 fprint(2, "%d nodes used; %d of %d hash slots used\n", nalloc, nnew, nhash); 376 exits(nil); 377 } 378