xref: /plan9/sys/src/cmd/compress/compress.c (revision 14f51593fd82e19ba95969a8c07ff71131015979)
1 /*
2  * compress - File compression ala IEEE Computer, June 1984.
3  *
4  * Algorithm from "A Technique for High Performance Data Compression",
5  * Terry A. Welch, IEEE Computer Vol 17, No 6 (June 1984), pp 8-19.
6  *
7  * Usage: compress [-dfvc] [-b bits] [file ...]
8  * Inputs:
9  *	-b:	limit the max number of bits/code.
10  *	-c:	write output on stdout, don't remove original.
11  *	-d:	decompress instead.
12  *	-f:	Forces output file to be generated, even if one already
13  *		exists, and even if no space is saved by compressing.
14  *		If -f is not used, the user will be prompted if stdin is
15  *		a tty, otherwise, the output file will not be overwritten.
16  *	-v:	Write compression statistics
17  *
18  * 	file ...: Files to be compressed.  If none specified, stdin is used.
19  * Outputs:
20  *	file.Z:	Compressed form of file with same mode, owner, and utimes
21  * 		or stdout (if stdin used as input)
22  *
23  * Assumptions:
24  *	When filenames are given, replaces with the compressed version
25  *	(.Z suffix) only if the file decreases in size.
26  * Algorithm:
27  * 	Modified Lempel-Ziv method (LZW).  Basically finds common
28  * substrings and replaces them with a variable size code.  This is
29  * deterministic, and can be done on the fly.  Thus, the decompression
30  * procedure needs no input table, but tracks the way the table was built.
31 
32  * Authors:	Spencer W. Thomas	(decvax!harpo!utah-cs!utah-gr!thomas)
33  *		Jim McKie		(decvax!mcvax!jim)
34  *		Steve Davies		(decvax!vax135!petsd!peora!srd)
35  *		Ken Turkowski		(decvax!decwrl!turtlevax!ken)
36  *		James A. Woods		(decvax!ihnp4!ames!jaw)
37  *		Joe Orost		(decvax!vax135!petsd!joe)
38  */
39 #define _PLAN9_SOURCE
40 #define _BSD_EXTENSION
41 #define _POSIX_SOURCE
42 
43 #include <u.h>
44 #include <stdio.h>
45 #include <ctype.h>
46 #include <stdlib.h>
47 #include <unistd.h>
48 #include <string.h>
49 #include <signal.h>
50 #include <utime.h>
51 #include <sys/types.h>
52 #include <sys/stat.h>
53 
54 #define	min(a,b)	((a>b) ? b : a)
55 
56 #define BITS	16
57 #define HSIZE	69001		/* 95% occupancy */
58 
59 /*
60  * a code_int must be able to hold 2**BITS values of type int, and also -1
61  */
62 typedef long	code_int;
63 typedef long	count_int;
64 
65 static char rcs_ident[] = "$Header: compress.c,v 4.0 85/07/30 12:50:00 joe Release $";
66 
67 uchar magic_header[] = { 0x1F, 0x9D };	/* 1F 9D */
68 
69 /* Defines for third byte of header */
70 #define BIT_MASK	0x1f
71 #define BLOCK_MASK	0x80
72 /* Masks 0x40 and 0x20 are free.  I think 0x20 should mean that there is
73 	a fourth header byte (for expansion).
74 */
75 #define INIT_BITS 9			/* initial number of bits/code */
76 
77 #define ARGVAL() (*++(*argv) || (--argc && *++argv))
78 
79 int n_bits;				/* number of bits/code */
80 int maxbits = BITS;			/* user settable max # bits/code */
81 code_int maxcode;			/* maximum code, given n_bits */
82 code_int maxmaxcode = 1 << BITS;	/* should NEVER generate this code */
83 
84 #define MAXCODE(n_bits)	((1 << (n_bits)) - 1)
85 
86 count_int htab[HSIZE];
87 ushort codetab[HSIZE];
88 
89 #define htabof(i)	htab[i]
90 #define codetabof(i)	codetab[i]
91 
92 code_int hsize = HSIZE;			/* for dynamic table sizing */
93 count_int fsize;
94 
95 /*
96  * To save much memory, we overlay the table used by compress() with those
97  * used by decompress().  The tab_prefix table is the same size and type
98  * as the codetab.  The tab_suffix table needs 2**BITS characters.  We
99  * get this from the beginning of htab.  The output stack uses the rest
100  * of htab, and contains characters.  There is plenty of room for any
101  * possible stack (stack used to be 8000 characters).
102  */
103 
104 #define tab_prefixof(i)	codetabof(i)
105 #define tab_suffixof(i)	((uchar *)(htab))[i]
106 #define de_stack		((uchar *)&tab_suffixof(1<<BITS))
107 
108 code_int free_ent = 0;			/* first unused entry */
109 int exit_stat = 0;
110 
111 void	cl_block(void);
112 void	cl_hash(count_int);
113 void	compress(void);
114 void	copystat(char *, char *);
115 void	decompress(void);
116 int	foreground(void);
117 code_int getcode(void);
118 void	onintr(int);
119 void	oops(int);
120 void	output(code_int);
121 void	prratio(FILE *, long, long);
122 void	version(void);
123 void	writeerr(void);
124 
125 void
Usage(void)126 Usage(void)
127 {
128 #ifdef DEBUG
129 	fprintf(stderr,"Usage: compress [-cdfDV] [-b maxbits] [file ...]\n");
130 #else
131 	fprintf(stderr,"Usage: compress [-cdfvV] [-b maxbits] [file ...]\n");
132 #endif /* DEBUG */
133 }
134 
135 int debug = 0;
136 int nomagic = 0;	/* Use a 3-byte magic number header, unless old file */
137 int zcat_flg = 0;	/* Write output on stdout, suppress messages */
138 int quiet = 1;		/* don't tell me about compression */
139 
140 /*
141  * block compression parameters -- after all codes are used up,
142  * and compression rate changes, start over.
143  */
144 int block_compress = BLOCK_MASK;
145 int clear_flg = 0;
146 long ratio = 0;
147 #define CHECK_GAP 10000	/* ratio check interval */
148 count_int checkpoint = CHECK_GAP;
149 /*
150  * the next two codes should not be changed lightly, as they must not
151  * lie within the contiguous general code space.
152  */
153 #define FIRST	257	/* first free entry */
154 #define	CLEAR	256	/* table clear output code */
155 
156 int force = 0;
157 char ofname [100];
158 #ifdef DEBUG
159 int verbose = 0;
160 #endif /* DEBUG */
161 void (*bgnd_flag)(int);
162 
163 int do_decomp = 0;
164 
main(argc,argv)165 main(argc, argv)
166 int argc;
167 char **argv;
168 {
169 	int overwrite = 0;	/* Do not overwrite unless given -f flag */
170 	char tempname[512];
171 	char **filelist, **fileptr;
172 	char *cp;
173 	struct stat statbuf;
174 
175 	if ( (bgnd_flag = signal ( SIGINT, SIG_IGN )) != SIG_IGN ) {
176 		signal(SIGINT, onintr);
177 		signal(SIGSEGV, oops);
178 	}
179 
180 	filelist = fileptr = (char **)(malloc(argc * sizeof(*argv)));
181 	*filelist = NULL;
182 
183 	if((cp = strrchr(argv[0], '/')) != 0)
184 		cp++;
185 	else
186 		cp = argv[0];
187 	if(strcmp(cp, "uncompress") == 0)
188 		do_decomp = 1;
189 	else if(strcmp(cp, "zcat") == 0) {
190 		do_decomp = 1;
191 		zcat_flg = 1;
192 	}
193 
194 	/*
195 	 * Argument Processing
196 	 * All flags are optional.
197 	 * -C	generate output compatible with compress 2.0.
198 	 * -D	debug
199 	 * -V	print Version; debug verbose
200 	 * -b maxbits	maxbits.  If -b is specified, then maxbits MUST be
201 	 *		given also.
202 	 * -c	cat all output to stdout
203 	 * -d	do_decomp
204 	 * -f	force overwrite of output file
205 	 * -n	no header: useful to uncompress old files
206 	 * -v	unquiet
207 	 * if a string is left, must be an input filename.
208 	 */
209 	for (argc--, argv++; argc > 0; argc--, argv++) {
210 	if (**argv == '-') {	/* A flag argument */
211 		while (*++(*argv)) {	/* Process all flags in this arg */
212 		switch (**argv) {
213 		case 'C':
214 			block_compress = 0;
215 			break;
216 #ifdef DEBUG
217 		case 'D':
218 			debug = 1;
219 			break;
220 		case 'V':
221 			verbose = 1;
222 			version();
223 			break;
224 #else
225 		case 'V':
226 			version();
227 			break;
228 #endif
229 		case 'b':
230 			if (!ARGVAL()) {
231 				fprintf(stderr, "Missing maxbits\n");
232 				Usage();
233 				exit(1);
234 			}
235 			maxbits = atoi(*argv);
236 			goto nextarg;
237 		case 'c':
238 			zcat_flg = 1;
239 			break;
240 		case 'd':
241 			do_decomp = 1;
242 			break;
243 		case 'f':
244 		case 'F':
245 			overwrite = 1;
246 			force = 1;
247 			break;
248 		case 'n':
249 			nomagic = 1;
250 			break;
251 		case 'q':
252 			quiet = 1;
253 			break;
254 		case 'v':
255 			quiet = 0;
256 			break;
257 		default:
258 			fprintf(stderr, "Unknown flag: '%c'; ", **argv);
259 			Usage();
260 			exit(1);
261 		}
262 		}
263 	} else {			/* Input file name */
264 		*fileptr++ = *argv;	/* Build input file list */
265 		*fileptr = NULL;
266 		/* process nextarg; */
267 	}
268 nextarg:
269 		continue;
270 	}
271 
272 	if(maxbits < INIT_BITS) maxbits = INIT_BITS;
273 	if (maxbits > BITS) maxbits = BITS;
274 	maxmaxcode = 1 << maxbits;
275 
276 	if (*filelist != NULL) {
277 	for (fileptr = filelist; *fileptr; fileptr++) {
278 		exit_stat = 0;
279 		if (do_decomp != 0) {			/* DECOMPRESSION */
280 		/* Check for .Z suffix */
281 		if (strcmp(*fileptr + strlen(*fileptr) - 2, ".Z") != 0) {
282 			/* No .Z: tack one on */
283 			strcpy(tempname, *fileptr);
284 			strcat(tempname, ".Z");
285 			*fileptr = tempname;
286 		}
287 		/* Open input file */
288 		if ((freopen(*fileptr, "r", stdin)) == NULL) {
289 			perror(*fileptr);
290 			continue;
291 		}
292 		/* Check the magic number */
293 		if (nomagic == 0) {
294 			if ((getchar() != (magic_header[0] & 0xFF))
295 			|| (getchar() != (magic_header[1] & 0xFF))) {
296 				fprintf(stderr, "%s: not in compressed format\n",
297 					*fileptr);
298 				continue;
299 			}
300 			maxbits = getchar();	/* set -b from file */
301 			block_compress = maxbits & BLOCK_MASK;
302 			maxbits &= BIT_MASK;
303 			maxmaxcode = 1 << maxbits;
304 			if(maxbits > BITS) {
305 				fprintf(stderr,
306 		"%s: compressed with %d bits, can only handle %d bits\n",
307 					*fileptr, maxbits, BITS);
308 				continue;
309 			}
310 		}
311 		/* Generate output filename */
312 		strcpy(ofname, *fileptr);
313 		ofname[strlen(*fileptr) - 2] = '\0';  /* Strip off .Z */
314 		} else {				/* COMPRESSION */
315 		if (strcmp(*fileptr + strlen(*fileptr) - 2, ".Z") == 0) {
316 			fprintf(stderr,
317 				"%s: already has .Z suffix -- no change\n",
318 				*fileptr);
319 			continue;
320 		}
321 		/* Open input file */
322 		if ((freopen(*fileptr, "r", stdin)) == NULL) {
323 			perror(*fileptr);
324 			continue;
325 		}
326 		(void) stat(*fileptr, &statbuf);
327 		fsize = (long) statbuf.st_size;
328 		/*
329 		 * tune hash table size for small files -- ad hoc,
330 		 * but the sizes match earlier #defines, which
331 		 * serve as upper bounds on the number of output codes.
332 		 */
333 		hsize = HSIZE;
334 		if (fsize < (1 << 12))
335 			hsize = min(5003, HSIZE);
336 		else if (fsize < (1 << 13))
337 			hsize = min(9001, HSIZE);
338 		else if (fsize < (1 << 14))
339 			hsize = min (18013, HSIZE);
340 		else if (fsize < (1 << 15))
341 			hsize = min (35023, HSIZE);
342 		else if (fsize < 47000)
343 			hsize = min (50021, HSIZE);
344 
345 		/* Generate output filename */
346 		strcpy(ofname, *fileptr);
347 #ifndef BSD4_2
348 		if ((cp=strrchr(ofname,'/')) != NULL)
349 			cp++;
350 		else
351 			cp = ofname;
352 		/*
353 		 *** changed 12 to 25;  should be NAMELEN-3, but I don't want
354 		 * to fight the headers.	ehg 5 Nov 92 **
355 		 */
356 		if (strlen(cp) > 25) {
357 			fprintf(stderr, "%s: filename too long to tack on .Z\n",
358 				cp);
359 			continue;
360 		}
361 #endif
362 		strcat(ofname, ".Z");
363 		}
364 		/* Check for overwrite of existing file */
365 		if (overwrite == 0 && zcat_flg == 0 &&
366 		   stat(ofname, &statbuf) == 0) {
367 			char response[2];
368 
369 			response[0] = 'n';
370 			fprintf(stderr, "%s already exists;", ofname);
371 			if (foreground()) {
372 				fprintf(stderr,
373 				    " do you wish to overwrite %s (y or n)? ",
374 					ofname);
375 				fflush(stderr);
376 				(void) read(2, response, 2);
377 				while (response[1] != '\n')
378 					if (read(2, response+1, 1) < 0) {
379 						/* Ack! */
380 						perror("stderr");
381 						break;
382 					}
383 			}
384 			if (response[0] != 'y') {
385 				fprintf(stderr, "\tnot overwritten\n");
386 				continue;
387 			}
388 		}
389 		if(zcat_flg == 0) {		/* Open output file */
390 			if (freopen(ofname, "w", stdout) == NULL) {
391 				perror(ofname);
392 				continue;
393 			}
394 			if(!quiet)
395 				fprintf(stderr, "%s: ", *fileptr);
396 		}
397 
398 		/* Actually do the compression/decompression */
399 		if (do_decomp == 0)
400 			compress();
401 #ifndef DEBUG
402 		else
403 			decompress();
404 #else
405 		else if (debug == 0)
406 			decompress();
407 		else
408 			printcodes();
409 		if (verbose)
410 			dump_tab();
411 #endif						/* DEBUG */
412 		if(zcat_flg == 0) {
413 			copystat(*fileptr, ofname);	/* Copy stats */
414 			if (exit_stat == 1 || !quiet)
415 				putc('\n', stderr);
416 		}
417 	}
418 	} else {		/* Standard input */
419 	if (do_decomp == 0) {
420 		compress();
421 #ifdef DEBUG
422 		if(verbose)
423 			dump_tab();
424 #endif
425 		if(!quiet)
426 			putc('\n', stderr);
427 	} else {
428 		/* Check the magic number */
429 		if (nomagic == 0) {
430 		if ((getchar()!=(magic_header[0] & 0xFF))
431 		 || (getchar()!=(magic_header[1] & 0xFF))) {
432 			fprintf(stderr, "stdin: not in compressed format\n");
433 			exit(1);
434 		}
435 		maxbits = getchar();	/* set -b from file */
436 		block_compress = maxbits & BLOCK_MASK;
437 		maxbits &= BIT_MASK;
438 		maxmaxcode = 1 << maxbits;
439 		fsize = 100000;		/* assume stdin large for USERMEM */
440 		if(maxbits > BITS) {
441 			fprintf(stderr,
442 			"stdin: compressed with %d bits, can only handle %d bits\n",
443 			maxbits, BITS);
444 			exit(1);
445 		}
446 		}
447 #ifndef DEBUG
448 		decompress();
449 #else
450 		if (debug == 0)
451 			decompress();
452 		else
453 			printcodes();
454 		if (verbose)
455 			dump_tab();
456 #endif						/* DEBUG */
457 	}
458 	}
459 	exit(exit_stat);
460 	return 0;
461 }
462 
463 static int offset;
464 long in_count = 1;		/* length of input */
465 long bytes_out;			/* length of compressed output */
466 long out_count = 0;		/* # of codes output (for debugging) */
467 
468 /*
469  * compress stdin to stdout
470  *
471  * Algorithm:  use open addressing double hashing (no chaining) on the
472  * prefix code / next character combination.  We do a variant of Knuth's
473  * algorithm D (vol. 3, sec. 6.4) along with G. Knott's relatively-prime
474  * secondary probe.  Here, the modular division first probe is gives way
475  * to a faster exclusive-or manipulation.  Also do block compression with
476  * an adaptive reset, whereby the code table is cleared when the compression
477  * ratio decreases, but after the table fills.  The variable-length output
478  * codes are re-sized at this point, and a special CLEAR code is generated
479  * for the decompressor.  Late addition:  construct the table according to
480  * file size for noticeable speed improvement on small files.  Please direct
481  * questions about this implementation to ames!jaw.
482  */
483 void
compress(void)484 compress(void)
485 {
486 	code_int ent, hsize_reg;
487 	code_int i;
488 	int c, disp, hshift;
489 	long fcode;
490 
491 	if (nomagic == 0) {
492 		putchar(magic_header[0]);
493 		putchar(magic_header[1]);
494 		putchar((char)(maxbits | block_compress));
495 		if(ferror(stdout))
496 			writeerr();
497 	}
498 	offset = 0;
499 	bytes_out = 3;		/* includes 3-byte header mojo */
500 	out_count = 0;
501 	clear_flg = 0;
502 	ratio = 0;
503 	in_count = 1;
504 	checkpoint = CHECK_GAP;
505 	maxcode = MAXCODE(n_bits = INIT_BITS);
506 	free_ent = (block_compress? FIRST: 256);
507 
508 	ent = getchar ();
509 
510 	hshift = 0;
511 	for (fcode = (long)hsize;  fcode < 65536L; fcode *= 2)
512 		hshift++;
513 	hshift = 8 - hshift;		/* set hash code range bound */
514 
515 	hsize_reg = hsize;
516 	cl_hash( (count_int) hsize_reg);		/* clear hash table */
517 
518 	while ((c = getchar()) != EOF) {
519 		in_count++;
520 		fcode = (long) (((long) c << maxbits) + ent);
521 	 	i = ((c << hshift) ^ ent);	/* xor hashing */
522 
523 		if (htabof (i) == fcode) {
524 			ent = codetabof(i);
525 			continue;
526 		} else if ((long)htabof(i) < 0 )	/* empty slot */
527 			goto nomatch;
528 	 	disp = hsize_reg - i;	/* secondary hash (after G. Knott) */
529 		if (i == 0)
530 			disp = 1;
531 probe:
532 		if ((i -= disp) < 0)
533 			i += hsize_reg;
534 
535 		if (htabof (i) == fcode) {
536 			ent = codetabof(i);
537 			continue;
538 		}
539 		if ((long)htabof(i) > 0)
540 			goto probe;
541 nomatch:
542 		output((code_int)ent);
543 		out_count++;
544 	 	ent = c;
545 		if (free_ent < maxmaxcode) {
546 	 		codetabof(i) = free_ent++;	/* code -> hashtable */
547 			htabof(i) = fcode;
548 		} else if ((count_int)in_count >= checkpoint && block_compress)
549 			cl_block ();
550 	}
551 	/*
552 	* Put out the final code.
553 	*/
554 	output( (code_int)ent );
555 	out_count++;
556 	output( (code_int)-1 );
557 
558 	/*
559 	* Print out stats on stderr
560 	*/
561 	if(zcat_flg == 0 && !quiet) {
562 #ifdef DEBUG
563 	fprintf( stderr,
564 		"%ld chars in, %ld codes (%ld bytes) out, compression factor: ",
565 		in_count, out_count, bytes_out );
566 	prratio( stderr, in_count, bytes_out );
567 	fprintf( stderr, "\n");
568 	fprintf( stderr, "\tCompression as in compact: " );
569 	prratio( stderr, in_count-bytes_out, in_count );
570 	fprintf( stderr, "\n");
571 	fprintf( stderr, "\tLargest code (of last block) was %d (%d bits)\n",
572 		free_ent - 1, n_bits );
573 #else /* !DEBUG */
574 	fprintf( stderr, "Compression: " );
575 	prratio( stderr, in_count-bytes_out, in_count );
576 #endif /* DEBUG */
577 	}
578 	if(bytes_out > in_count)	/* exit(2) if no savings */
579 		exit_stat = 2;
580 }
581 
582 /*
583  * TAG( output )
584  *
585  * Output the given code.
586  * Inputs:
587  * 	code:	A n_bits-bit integer.  If == -1, then EOF.  This assumes
588  *		that n_bits =< (long)wordsize - 1.
589  * Outputs:
590  * 	Outputs code to the file.
591  * Assumptions:
592  *	Chars are 8 bits long.
593  * Algorithm:
594  * 	Maintain a BITS character long buffer (so that 8 codes will
595  * fit in it exactly).  When the buffer fills up empty it and start over.
596  */
597 
598 static char buf[BITS];
599 
600 uchar lmask[9] = {0xff, 0xfe, 0xfc, 0xf8, 0xf0, 0xe0, 0xc0, 0x80, 0x00};
601 uchar rmask[9] = {0x00, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, 0xff};
602 
603 void
output(code)604 output( code )
605 code_int  code;
606 {
607 #ifdef DEBUG
608 	static int col = 0;
609 #endif
610 	int r_off = offset, bits= n_bits;
611 	char *bp = buf;
612 
613 #ifdef DEBUG
614 	if (verbose)
615 		fprintf(stderr, "%5d%c", code,
616 			(col+=6) >= 74? (col = 0, '\n'): ' ');
617 #endif
618 	if (code >= 0) {
619 		/*
620 		 * byte/bit numbering on the VAX is simulated by the
621 		 * following code
622 		 */
623 		/*
624 		 * Get to the first byte.
625 		 */
626 		bp += (r_off >> 3);
627 		r_off &= 7;
628 		/*
629 		 * Since code is always >= 8 bits, only need to mask the first
630 		 * hunk on the left.
631 		 */
632 		*bp = (*bp & rmask[r_off]) | (code << r_off) & lmask[r_off];
633 		bp++;
634 		bits -=  8 - r_off;
635 		code >>= 8 - r_off;
636 		/* Get any 8 bit parts in the middle (<=1 for up to 16 bits). */
637 		if ( bits >= 8 ) {
638 			*bp++ = code;
639 			code >>= 8;
640 			bits -= 8;
641 		}
642 		/* Last bits. */
643 		if(bits)
644 			*bp = code;
645 
646 		offset += n_bits;
647 		if ( offset == (n_bits << 3) ) {
648 			bp = buf;
649 			bits = n_bits;
650 			bytes_out += bits;
651 			do {
652 				putchar(*bp++);
653 			} while(--bits);
654 			offset = 0;
655 		}
656 
657 		/*
658 		 * If the next entry is going to be too big for the code size,
659 		 * then increase it, if possible.
660 		 */
661 		if ( free_ent > maxcode || (clear_flg > 0)) {
662 			/*
663 			* Write the whole buffer, because the input side won't
664 			* discover the size increase until after it has read it.
665 			*/
666 			if ( offset > 0 ) {
667 				if( fwrite( buf, 1, n_bits, stdout ) != n_bits)
668 					writeerr();
669 				bytes_out += n_bits;
670 			}
671 			offset = 0;
672 
673 			if ( clear_flg ) {
674 				maxcode = MAXCODE (n_bits = INIT_BITS);
675 				clear_flg = 0;
676 			} else {
677 				n_bits++;
678 				if ( n_bits == maxbits )
679 					maxcode = maxmaxcode;
680 				else
681 					maxcode = MAXCODE(n_bits);
682 			}
683 #ifdef DEBUG
684 			if ( debug ) {
685 				fprintf(stderr,
686 					"\nChange to %d bits\n", n_bits);
687 				col = 0;
688 			}
689 #endif
690 		}
691 	} else {
692 		/*
693 		 * At EOF, write the rest of the buffer.
694 		 */
695 		if ( offset > 0 )
696 			fwrite( buf, 1, (offset + 7) / 8, stdout );
697 		bytes_out += (offset + 7) / 8;
698 		offset = 0;
699 		fflush( stdout );
700 #ifdef DEBUG
701 		if ( verbose )
702 			fprintf( stderr, "\n" );
703 #endif
704 		if( ferror( stdout ) )
705 			writeerr();
706 	}
707 }
708 
709 /*
710  * Decompress stdin to stdout.  This routine adapts to the codes in the
711  * file building the "string" table on-the-fly; requiring no table to
712  * be stored in the compressed file.  The tables used herein are shared
713  * with those of the compress() routine.  See the definitions above.
714  */
715 void
decompress(void)716 decompress(void)
717 {
718 	int finchar;
719 	code_int code, oldcode, incode;
720 	uchar *stackp;
721 
722 	/*
723 	* As above, initialize the first 256 entries in the table.
724 	*/
725 	maxcode = MAXCODE(n_bits = INIT_BITS);
726 	for (code = 255; code >= 0; code--) {
727 		tab_prefixof(code) = 0;
728 		tab_suffixof(code) = (uchar)code;
729 	}
730 	free_ent = (block_compress? FIRST: 256);
731 
732 	finchar = oldcode = getcode();
733 	if(oldcode == -1)		/* EOF already? */
734 		return;			/* Get out of here */
735 	putchar((char)finchar);		/* first code must be 8 bits = char */
736 	if(ferror(stdout))		/* Crash if can't write */
737 		writeerr();
738 	stackp = de_stack;
739 
740 	while ((code = getcode()) > -1) {
741 		if ((code == CLEAR) && block_compress) {
742 			for (code = 255; code >= 0; code--)
743 				tab_prefixof(code) = 0;
744 			clear_flg = 1;
745 			free_ent = FIRST - 1;
746 			if ((code = getcode()) == -1)	/* O, untimely death! */
747 				break;
748 		}
749 		incode = code;
750 		/*
751 		 * Special case for KwKwK string.
752 		 */
753 		if (code >= free_ent) {
754 			*stackp++ = finchar;
755 			code = oldcode;
756 		}
757 
758 		/*
759 		 * Generate output characters in reverse order
760 		 */
761 		while (code >= 256) {
762 			*stackp++ = tab_suffixof(code);
763 			code = tab_prefixof(code);
764 		}
765 		*stackp++ = finchar = tab_suffixof(code);
766 
767 		/*
768 		 * And put them out in forward order
769 		 */
770 		do {
771 			putchar(*--stackp);
772 		} while (stackp > de_stack);
773 
774 		/*
775 		 * Generate the new entry.
776 		 */
777 		if ( (code=free_ent) < maxmaxcode ) {
778 			tab_prefixof(code) = (ushort)oldcode;
779 			tab_suffixof(code) = finchar;
780 			free_ent = code+1;
781 		}
782 		/*
783 		 * Remember previous code.
784 		 */
785 		oldcode = incode;
786 	}
787 	fflush(stdout);
788 	if(ferror(stdout))
789 		writeerr();
790 }
791 
792 /*
793  * TAG( getcode )
794  *
795  * Read one code from the standard input.  If EOF, return -1.
796  * Inputs:
797  * 	stdin
798  * Outputs:
799  * 	code or -1 is returned.
800  */
801 code_int
getcode()802 getcode()
803 {
804 	int r_off, bits;
805 	code_int code;
806 	static int offset = 0, size = 0;
807 	static uchar buf[BITS];
808 	uchar *bp = buf;
809 
810 	if ( clear_flg > 0 || offset >= size || free_ent > maxcode ) {
811 		/*
812 		 * If the next entry will be too big for the current code
813 		 * size, then we must increase the size.  This implies reading
814 		 * a new buffer full, too.
815 		 */
816 		if ( free_ent > maxcode ) {
817 			n_bits++;
818 			if ( n_bits == maxbits )
819 				maxcode = maxmaxcode; /* won't get any bigger now */
820 			else
821 				maxcode = MAXCODE(n_bits);
822 		}
823 		if ( clear_flg > 0) {
824 			maxcode = MAXCODE(n_bits = INIT_BITS);
825 			clear_flg = 0;
826 		}
827 		size = fread(buf, 1, n_bits, stdin);
828 		if (size <= 0)
829 			return -1;			/* end of file */
830 		offset = 0;
831 		/* Round size down to integral number of codes */
832 		size = (size << 3) - (n_bits - 1);
833 	}
834 	r_off = offset;
835 	bits = n_bits;
836 	/*
837 	 * Get to the first byte.
838 	 */
839 	bp += (r_off >> 3);
840 	r_off &= 7;
841 	/* Get first part (low order bits) */
842 	code = (*bp++ >> r_off);
843 	bits -= (8 - r_off);
844 	r_off = 8 - r_off;		/* now, offset into code word */
845 	/* Get any 8 bit parts in the middle (<=1 for up to 16 bits). */
846 	if (bits >= 8) {
847 		code |= *bp++ << r_off;
848 		r_off += 8;
849 		bits -= 8;
850 	}
851 	/* high order bits. */
852 	code |= (*bp & rmask[bits]) << r_off;
853 	offset += n_bits;
854 	return code;
855 }
856 
857 #ifdef DEBUG
printcodes()858 printcodes()
859 {
860 	/*
861 	* Just print out codes from input file.  For debugging.
862 	*/
863 	code_int code;
864 	int col = 0, bits;
865 
866 	bits = n_bits = INIT_BITS;
867 	maxcode = MAXCODE(n_bits);
868 	free_ent = ((block_compress) ? FIRST : 256 );
869 	while ( ( code = getcode() ) >= 0 ) {
870 	if ( (code == CLEAR) && block_compress ) {
871 			free_ent = FIRST - 1;
872 			clear_flg = 1;
873 	}
874 	else if ( free_ent < maxmaxcode )
875 		free_ent++;
876 	if ( bits != n_bits ) {
877 		fprintf(stderr, "\nChange to %d bits\n", n_bits );
878 		bits = n_bits;
879 		col = 0;
880 	}
881 	fprintf(stderr, "%5d%c", code, (col+=6) >= 74 ? (col = 0, '\n') : ' ' );
882 	}
883 	putc( '\n', stderr );
884 	exit( 0 );
885 }
886 
887 code_int sorttab[1<<BITS];	/* sorted pointers into htab */
888 
889 #define STACK_SIZE	15000
890 
dump_tab()891 dump_tab()	/* dump string table */
892 {
893 	int i, first, c, ent;
894 	int stack_top = STACK_SIZE;
895 
896 	if(do_decomp == 0) {	/* compressing */
897 	int flag = 1;
898 
899 	for(i=0; i<hsize; i++) {	/* build sort pointers */
900 		if((long)htabof(i) >= 0) {
901 			sorttab[codetabof(i)] = i;
902 		}
903 	}
904 	first = block_compress ? FIRST : 256;
905 	for(i = first; i < free_ent; i++) {
906 		fprintf(stderr, "%5d: \"", i);
907 		de_stack[--stack_top] = '\n';
908 		de_stack[--stack_top] = '"';
909 		stack_top = in_stack((htabof(sorttab[i])>>maxbits)&0xff,
910 	stack_top);
911 		for(ent=htabof(sorttab[i]) & ((1<<maxbits)-1);
912 			ent > 256;
913 			ent=htabof(sorttab[ent]) & ((1<<maxbits)-1)) {
914 			stack_top = in_stack(htabof(sorttab[ent]) >> maxbits,
915 						stack_top);
916 		}
917 		stack_top = in_stack(ent, stack_top);
918 		fwrite( &de_stack[stack_top], 1, STACK_SIZE-stack_top, stderr);
919 			stack_top = STACK_SIZE;
920 	}
921 	} else if(!debug) {	/* decompressing */
922 
923 	for ( i = 0; i < free_ent; i++ ) {
924 		ent = i;
925 		c = tab_suffixof(ent);
926 		if ( isascii(c) && isprint(c) )
927 		fprintf( stderr, "%5d: %5d/'%c'  \"",
928 				ent, tab_prefixof(ent), c );
929 		else
930 		fprintf( stderr, "%5d: %5d/\\%03o \"",
931 				ent, tab_prefixof(ent), c );
932 		de_stack[--stack_top] = '\n';
933 		de_stack[--stack_top] = '"';
934 		for ( ; ent != NULL;
935 			ent = (ent >= FIRST ? tab_prefixof(ent) : NULL) ) {
936 		stack_top = in_stack(tab_suffixof(ent), stack_top);
937 		}
938 		fwrite( &de_stack[stack_top], 1, STACK_SIZE - stack_top, stderr );
939 		stack_top = STACK_SIZE;
940 	}
941 	}
942 }
943 
944 int
in_stack(int c,int stack_top)945 in_stack(int c, int stack_top)
946 {
947 	if ( (isascii(c) && isprint(c) && c != '\\') || c == ' ' ) {
948 		de_stack[--stack_top] = c;
949 	} else {
950 		switch( c ) {
951 		case '\n': de_stack[--stack_top] = 'n'; break;
952 		case '\t': de_stack[--stack_top] = 't'; break;
953 		case '\b': de_stack[--stack_top] = 'b'; break;
954 		case '\f': de_stack[--stack_top] = 'f'; break;
955 		case '\r': de_stack[--stack_top] = 'r'; break;
956 		case '\\': de_stack[--stack_top] = '\\'; break;
957 		default:
958 	 	de_stack[--stack_top] = '0' + c % 8;
959 	 	de_stack[--stack_top] = '0' + (c / 8) % 8;
960 	 	de_stack[--stack_top] = '0' + c / 64;
961 	 	break;
962 		}
963 		de_stack[--stack_top] = '\\';
964 	}
965 	return stack_top;
966 }
967 #endif /* DEBUG */
968 
969 void
writeerr(void)970 writeerr(void)
971 {
972 	perror(ofname);
973 	unlink(ofname);
974 	exit(1);
975 }
976 
977 void
copystat(ifname,ofname)978 copystat(ifname, ofname)
979 char *ifname, *ofname;
980 {
981 	int mode;
982 	time_t timep[2];			/* should be struct utimbuf */
983 	struct stat statbuf;
984 
985 	fclose(stdout);
986 	if (stat(ifname, &statbuf)) {		/* Get stat on input file */
987 		perror(ifname);
988 		return;
989 	}
990 	if (!S_ISREG(statbuf.st_mode)) {
991 		if (quiet)
992 			fprintf(stderr, "%s: ", ifname);
993 		fprintf(stderr, " -- not a regular file: unchanged");
994 		exit_stat = 1;
995 	} else if (exit_stat == 2 && !force) {
996 		/* No compression: remove file.Z */
997 		if (!quiet)
998 			fprintf(stderr, " -- file unchanged");
999 	} else {			/* Successful Compression */
1000 		exit_stat = 0;
1001 		mode = statbuf.st_mode & 0777;
1002 		if (chmod(ofname, mode))		/* Copy modes */
1003 			perror(ofname);
1004 		/* Copy ownership */
1005 		chown(ofname, statbuf.st_uid, statbuf.st_gid);
1006 		timep[0] = statbuf.st_atime;
1007 		timep[1] = statbuf.st_mtime;
1008 		/* Update last accessed and modified times */
1009 		utime(ofname, (struct utimbuf *)timep);
1010 //		if (unlink(ifname))	/* Remove input file */
1011 //			perror(ifname);
1012 		return;			/* success */
1013 	}
1014 
1015 	/* Unsuccessful return -- one of the tests failed */
1016 	if (unlink(ofname))
1017 		perror(ofname);
1018 }
1019 
1020 /*
1021  * This routine returns 1 if we are running in the foreground and stderr
1022  * is a tty.
1023  */
1024 int
foreground(void)1025 foreground(void)
1026 {
1027 	if(bgnd_flag)			/* background? */
1028 		return 0;
1029 	else				/* foreground */
1030 		return isatty(2);	/* and stderr is a tty */
1031 }
1032 
1033 void
onintr(int x)1034 onintr(int x)
1035 {
1036 	USED(x);
1037 	unlink(ofname);
1038 	exit(1);
1039 }
1040 
1041 void
oops(int x)1042 oops(int x)		/* wild pointer -- assume bad input */
1043 {
1044 	USED(x);
1045 	if (do_decomp == 1)
1046 		fprintf(stderr, "uncompress: corrupt input\n");
1047 	unlink(ofname);
1048 	exit(1);
1049 }
1050 
1051 void
cl_block(void)1052 cl_block(void)		/* table clear for block compress */
1053 {
1054 	long rat;
1055 
1056 	checkpoint = in_count + CHECK_GAP;
1057 #ifdef DEBUG
1058 	if ( debug ) {
1059 		fprintf ( stderr, "count: %ld, ratio: ", in_count );
1060 		prratio ( stderr, in_count, bytes_out );
1061 		fprintf ( stderr, "\n");
1062 	}
1063 #endif /* DEBUG */
1064 
1065 	if (in_count > 0x007fffff) {	/* shift will overflow */
1066 		rat = bytes_out >> 8;
1067 		if (rat == 0)		/* Don't divide by zero */
1068 			rat = 0x7fffffff;
1069 		else
1070 			rat = in_count / rat;
1071 	} else
1072 		rat = (in_count << 8) / bytes_out;	/* 8 fractional bits */
1073 	if (rat > ratio)
1074 		ratio = rat;
1075 	else {
1076 		ratio = 0;
1077 #ifdef DEBUG
1078 		if (verbose)
1079 			dump_tab();	/* dump string table */
1080 #endif
1081 		cl_hash((count_int)hsize);
1082 		free_ent = FIRST;
1083 		clear_flg = 1;
1084 		output((code_int)CLEAR);
1085 #ifdef DEBUG
1086 		if (debug)
1087 			fprintf(stderr, "clear\n");
1088 #endif /* DEBUG */
1089 	}
1090 }
1091 
1092 void
cl_hash(count_int hsize)1093 cl_hash(count_int hsize)		/* reset code table */
1094 {
1095 	count_int *htab_p = htab+hsize;
1096 	long i;
1097 	long m1 = -1;
1098 
1099 	i = hsize - 16;
1100  	do {				/* might use Sys V memset(3) here */
1101 		*(htab_p-16) = m1;
1102 		*(htab_p-15) = m1;
1103 		*(htab_p-14) = m1;
1104 		*(htab_p-13) = m1;
1105 		*(htab_p-12) = m1;
1106 		*(htab_p-11) = m1;
1107 		*(htab_p-10) = m1;
1108 		*(htab_p-9) = m1;
1109 		*(htab_p-8) = m1;
1110 		*(htab_p-7) = m1;
1111 		*(htab_p-6) = m1;
1112 		*(htab_p-5) = m1;
1113 		*(htab_p-4) = m1;
1114 		*(htab_p-3) = m1;
1115 		*(htab_p-2) = m1;
1116 		*(htab_p-1) = m1;
1117 		htab_p -= 16;
1118 	} while ((i -= 16) >= 0);
1119 	for ( i += 16; i > 0; i-- )
1120 		*--htab_p = m1;
1121 }
1122 
1123 void
prratio(stream,num,den)1124 prratio(stream, num, den)
1125 FILE *stream;
1126 long num, den;
1127 {
1128 	int q;				/* Doesn't need to be long */
1129 
1130 	if(num > 214748L)		/* 2147483647/10000 */
1131 		q = num / (den / 10000L);
1132 	else
1133 		q = 10000L * num / den;	/* Long calculations, though */
1134 	if (q < 0) {
1135 		putc('-', stream);
1136 		q = -q;
1137 	}
1138 	fprintf(stream, "%d.%02d%%", q / 100, q % 100);
1139 }
1140 
1141 void
version(void)1142 version(void)
1143 {
1144 	fprintf(stderr, "%s\n", rcs_ident);
1145 	fprintf(stderr, "Options: ");
1146 #ifdef DEBUG
1147 	fprintf(stderr, "DEBUG, ");
1148 #endif
1149 #ifdef BSD4_2
1150 	fprintf(stderr, "BSD4_2, ");
1151 #endif
1152 	fprintf(stderr, "BITS = %d\n", BITS);
1153 }
1154 
1155 /*
1156  * The revision-history novel:
1157  *
1158  * $Header: compress.c,v 4.0 85/07/30 12:50:00 joe Release $
1159  * $Log:	compress.c,v $
1160  * Revision 4.0  85/07/30  12:50:00  joe
1161  * Removed ferror() calls in output routine on every output except first.
1162  * Prepared for release to the world.
1163  *
1164  * Revision 3.6  85/07/04  01:22:21  joe
1165  * Remove much wasted storage by overlaying hash table with the tables
1166  * used by decompress: tab_suffix[1<<BITS], stack[8000].  Updated USERMEM
1167  * computations.  Fixed dump_tab() DEBUG routine.
1168  *
1169  * Revision 3.5  85/06/30  20:47:21  jaw
1170  * Change hash function to use exclusive-or.  Rip out hash cache.  These
1171  * speedups render the megamemory version defunct, for now.  Make decoder
1172  * stack global.  Parts of the RCS trunks 2.7, 2.6, and 2.1 no longer apply.
1173  *
1174  * Revision 3.4  85/06/27  12:00:00  ken
1175  * Get rid of all floating-point calculations by doing all compression ratio
1176  * calculations in fixed point.
1177  *
1178  * Revision 3.3  85/06/24  21:53:24  joe
1179  * Incorporate portability suggestion for M_XENIX.  Got rid of text on #else
1180  * and #endif lines.  Cleaned up #ifdefs for vax and interdata.
1181  *
1182  * Revision 3.2  85/06/06  21:53:24  jaw
1183  * Incorporate portability suggestions for Z8000, IBM PC/XT from mailing list.
1184  * Default to "quiet" output (no compression statistics).
1185  *
1186  * Revision 3.1  85/05/12  18:56:13  jaw
1187  * Integrate decompress() stack speedups (from early pointer mods by McKie).
1188  * Repair multi-file USERMEM gaffe.  Unify 'force' flags to mimic semantics
1189  * of SVR2 'pack'.  Streamline block-compress table clear logic.  Increase
1190  * output byte count by magic number size.
1191  *
1192  * Revision 3.0	84/11/27  11:50:00  petsd!joe
1193  * Set HSIZE depending on BITS.  Set BITS depending on USERMEM.  Unrolled
1194  * loops in clear routines.  Added "-C" flag for 2.0 compatibility.  Used
1195  * unsigned compares on Perkin-Elmer.  Fixed foreground check.
1196  *
1197  * Revision 2.7	84/11/16  19:35:39  ames!jaw
1198  * Cache common hash codes based on input statistics; this improves
1199  * performance for low-density raster images.  Pass on #ifdef bundle
1200  * from Turkowski.
1201  *
1202  * Revision 2.6	84/11/05  19:18:21  ames!jaw
1203  * Vary size of hash tables to reduce time for small files.
1204  * Tune PDP-11 hash function.
1205  *
1206  * Revision 2.5	84/10/30  20:15:14  ames!jaw
1207  * Junk chaining; replace with the simpler (and, on the VAX, faster)
1208  * double hashing, discussed within.  Make block compression standard.
1209  *
1210  * Revision 2.4	84/10/16  11:11:11  ames!jaw
1211  * Introduce adaptive reset for block compression, to boost the rate
1212  * another several percent.  (See mailing list notes.)
1213  *
1214  * Revision 2.3	84/09/22  22:00:00  petsd!joe
1215  * Implemented "-B" block compress.  Implemented REVERSE sorting of tab_next.
1216  * Bug fix for last bits.  Changed fwrite to putchar loop everywhere.
1217  *
1218  * Revision 2.2	84/09/18  14:12:21  ames!jaw
1219  * Fold in news changes, small machine typedef from thomas,
1220  * #ifdef interdata from joe.
1221  *
1222  * Revision 2.1	84/09/10  12:34:56  ames!jaw
1223  * Configured fast table lookup for 32-bit machines.
1224  * This cuts user time in half for b <= FBITS, and is useful for news batching
1225  * from VAX to PDP sites.  Also sped up decompress() [fwrite->putc] and
1226  * added signal catcher [plus beef in writeerr()] to delete effluvia.
1227  *
1228  * Revision 2.0	84/08/28  22:00:00  petsd!joe
1229  * Add check for foreground before prompting user.  Insert maxbits into
1230  * compressed file.  Force file being uncompressed to end with ".Z".
1231  * Added "-c" flag and "zcat".  Prepared for release.
1232  *
1233  * Revision 1.10  84/08/24  18:28:00  turtlevax!ken
1234  * Will only compress regular files (no directories), added a magic number
1235  * header (plus an undocumented -n flag to handle old files without headers),
1236  * added -f flag to force overwriting of possibly existing destination file,
1237  * otherwise the user is prompted for a response.  Will tack on a .Z to a
1238  * filename if it doesn't have one when decompressing.  Will only replace
1239  * file if it was compressed.
1240  *
1241  * Revision 1.9  84/08/16  17:28:00  turtlevax!ken
1242  * Removed scanargs(), getopt(), added .Z extension and unlimited number of
1243  * filenames to compress.  Flags may be clustered (-Ddvb12) or separated
1244  * (-D -d -v -b 12), or combination thereof.  Modes and other status is
1245  * copied with copystat().  -O bug for 4.2 seems to have disappeared with
1246  * 1.8.
1247  *
1248  * Revision 1.8  84/08/09  23:15:00  joe
1249  * Made it compatible with vax version, installed jim's fixes/enhancements
1250  *
1251  * Revision 1.6  84/08/01  22:08:00  joe
1252  * Sped up algorithm significantly by sorting the compress chain.
1253  *
1254  * Revision 1.5  84/07/13  13:11:00  srd
1255  * Added C version of vax asm routines.  Changed structure to arrays to
1256  * save much memory.  Do unsigned compares where possible (faster on
1257  * Perkin-Elmer)
1258  *
1259  * Revision 1.4  84/07/05  03:11:11  thomas
1260  * Clean up the code a little and lint it.  (Lint complains about all
1261  * the regs used in the asm, but I'm not going to "fix" this.)
1262  *
1263  * Revision 1.3  84/07/05  02:06:54  thomas
1264  * Minor fixes.
1265  *
1266  * Revision 1.2  84/07/05  00:27:27  thomas
1267  * Add variable bit length output.
1268  */
1269