1 /* 2 * Extraordinarily brute force Lempel & Ziv-like 3 * compressor. The input file must fit in memory 4 * during compression, and the output file will 5 * be reconstructed in memory during decompression. 6 * We search for large common sequences and use a 7 * greedy algorithm to choose which sequence gets 8 * compressed first. 9 * 10 * Files begin with "BLZ\n" and a 32-bit uncompressed file length. 11 * 12 * Output format is a series of blocks followed by 13 * a raw data section. Each block begins with a 32-bit big-endian 14 * number. The top bit is type and the next 31 bits 15 * are uncompressed size. Type is one of 16 * 0 - use raw data for this length 17 * 1 - a 32-bit offset follows 18 * After the blocks come the raw data. (The end of the blocks can be 19 * noted by summing block lengths until you reach the file length.) 20 */ 21 22 #include <u.h> 23 #include <libc.h> 24 #include <bio.h> 25 26 int minrun = 16; 27 int win = 16; 28 ulong outn; 29 int verbose; 30 int mindist; 31 32 enum { Prime = 16777213 }; /* smallest prime < 2^24 (so p*256+256 < 2^32) */ 33 34 Biobuf bout; 35 ulong length; 36 uchar *data; 37 ulong sum32(ulong, void*, long); 38 uchar *odat; 39 int nodat; 40 int nraw; 41 int rawstart; 42 int acct; 43 int zlength; 44 int maxchain; 45 int maxrle[256]; 46 47 typedef struct Node Node; 48 struct Node { 49 ulong key; 50 int offset; 51 Node *left; 52 Node *right; 53 Node *link; 54 }; 55 56 Node *nodepool; 57 int nnodepool; 58 59 Node *root; 60 61 Node* 62 allocnode(void) 63 { 64 if(nodepool == nil){ 65 nnodepool = length+win+1; 66 nodepool = mallocz(sizeof(Node)*nnodepool, 1); 67 } 68 assert(nnodepool > 0); 69 return &nodepool[--nnodepool]; 70 } 71 72 Node** 73 llookup(ulong key, int walkdown) 74 { 75 Node **l; 76 77 l = &root; 78 while(*l){ 79 if(key < (*l)->key) 80 l = &(*l)->left; 81 else if(key > (*l)->key) 82 l = &(*l)->right; 83 else{ 84 if(walkdown){ 85 while(*l) 86 l=&(*l)->link; 87 } 88 break; 89 } 90 } 91 return l; 92 } 93 94 95 void 96 insertnode(ulong key, ulong offset) 97 { 98 Node **l, *n; 99 100 l = llookup(key, 1); 101 n = allocnode(); 102 n->key = key; 103 n->offset = offset; 104 *l = n; 105 } 106 107 Node* 108 lookup(ulong key) 109 { 110 return *llookup(key, 0); 111 } 112 113 void 114 Bputint(Biobufhdr *b, int n) 115 { 116 uchar p[4]; 117 118 p[0] = n>>24; 119 p[1] = n>>16; 120 p[2] = n>>8; 121 p[3] = n; 122 Bwrite(b, p, 4); 123 } 124 125 void 126 flushraw(void) 127 { 128 if(nraw){ 129 if(verbose) 130 fprint(2, "Raw %d+%d\n", rawstart, nraw); 131 zlength += 4+nraw; 132 Bputint(&bout, (1<<31)|nraw); 133 memmove(odat+nodat, data+rawstart, nraw); 134 nodat += nraw; 135 nraw = 0; 136 } 137 } 138 139 int 140 rawbyte(int i) 141 { 142 assert(acct == i); 143 if(nraw == 0) 144 rawstart = i; 145 acct++; 146 nraw++; 147 return 1; 148 } 149 150 int 151 refblock(int i, int len, int off) 152 { 153 assert(acct == i); 154 acct += len; 155 if(nraw) 156 flushraw(); 157 if(verbose) 158 fprint(2, "Copy %d+%d from %d\n", i, len, off); 159 Bputint(&bout, len); 160 Bputint(&bout, off); 161 zlength += 4+4; 162 return len; 163 } 164 165 int 166 cmprun(uchar *a, uchar *b, int len) 167 { 168 int i; 169 170 if(a==b) 171 return 0; 172 for(i=0; i<len && a[i]==b[i]; i++) 173 ; 174 return i; 175 } 176 177 int 178 countrle(uchar *a) 179 { 180 int i; 181 182 for(i=0; a[i]==a[0]; i++) 183 ; 184 return i; 185 } 186 187 void 188 compress(void) 189 { 190 int i, j, rle, run, maxrun, maxoff; 191 ulong sum; 192 Node *n; 193 194 sum = 0; 195 for(i=0; i<win && i<length; i++) 196 sum = (sum*256+data[i])%Prime; 197 for(i=0; i<length-win; ){ 198 n = lookup(sum); 199 maxrun = 0; 200 maxoff = 0; 201 if(verbose) 202 fprint(2, "look %.6lux\n", sum); 203 j=0; 204 for(; n; n=n->link, j++){ 205 run = cmprun(data+i, data+n->offset, length-i); 206 if(verbose) 207 fprint(2, "(%d,%d)%d...", i, n->offset, run); 208 if(run > maxrun && n->offset+mindist < i){ 209 maxrun = run; 210 if(verbose) 211 fprint(2, "[%d] (min %d)\n", maxrun, minrun); 212 maxoff = n->offset; 213 } 214 } 215 if(j > maxchain){ 216 if(verbose) 217 fprint(2, "%.6lux %d\n", sum, j); 218 maxchain = j; 219 } 220 if(maxrun >= minrun) 221 j = i+refblock(i, maxrun, maxoff); 222 else 223 j = i+rawbyte(i); 224 for(; i<j; i++){ 225 /* avoid huge chains from large runs of same byte */ 226 rle = countrle(data+i); 227 if(rle<4) 228 insertnode(sum, i); 229 else if(rle>maxrle[data[i]]){ 230 maxrle[data[i]] = rle; 231 insertnode(sum, i); 232 } 233 sum = (sum*256+data[i+win]) % Prime; 234 sum = (sum + data[i]*outn) % Prime; 235 } 236 } 237 /* could do better here */ 238 for(; i<length; i++) 239 rawbyte(i); 240 flushraw(); 241 } 242 243 void 244 usage(void) 245 { 246 fprint(2, "usage: xzip [-n winsize] [file]\n"); 247 exits("usage"); 248 } 249 250 void 251 main(int argc, char **argv) 252 { 253 int fd, i, n; 254 char buf[10485760]; 255 256 ARGBEGIN{ 257 case 'd': 258 verbose = 1; 259 break; 260 case 'm': 261 mindist = atoi(EARGF(usage())); 262 break; 263 case 'n': 264 win = atoi(EARGF(usage())); 265 minrun = win; 266 break; 267 default: 268 usage(); 269 }ARGEND 270 271 switch(argc){ 272 default: 273 usage(); 274 case 0: 275 fd = 0; 276 break; 277 case 1: 278 if((fd = open(argv[0], OREAD)) < 0) 279 sysfatal("open %s: %r", argv[0]); 280 break; 281 } 282 283 while((n = readn(fd, buf, sizeof buf)) > 0){ 284 data = realloc(data, length+n); 285 if(data == nil) 286 sysfatal("realloc: %r"); 287 memmove(data+length, buf, n); 288 length += n; 289 if(n < sizeof buf) 290 break; 291 } 292 odat = malloc(length); 293 if(odat == nil) 294 sysfatal("malloc: %r"); 295 296 Binit(&bout, 1, OWRITE); 297 Bprint(&bout, "BLZ\n"); 298 Bputint(&bout, length); 299 outn = 1; 300 for(i=0; i<win; i++) 301 outn = (outn * 256) % Prime; 302 303 if(verbose) 304 fprint(2, "256^%d = %.6lux\n", win, outn); 305 outn = Prime - outn; 306 if(verbose) 307 fprint(2, "outn = %.6lux\n", outn); 308 309 compress(); 310 Bwrite(&bout, odat, nodat); 311 Bterm(&bout); 312 exits(nil); 313 } 314