xref: /plan9/sys/src/cmd/gs/src/shcgen.c (revision 593dc095aefb2a85c828727bbfa9da139a49bdf4)
1 /* Copyright (C) 1994, 1996, 1998 Aladdin Enterprises.  All rights reserved.
2 
3   This software is provided AS-IS with no warranty, either express or
4   implied.
5 
6   This software is distributed under license and may not be copied,
7   modified or distributed except as expressly authorized under the terms
8   of the license contained in the file LICENSE in this distribution.
9 
10   For more information about licensing, please refer to
11   http://www.ghostscript.com/licensing/. For information on
12   commercial licensing, go to http://www.artifex.com/licensing/ or
13   contact Artifex Software, Inc., 101 Lucas Valley Road #110,
14   San Rafael, CA  94903, U.S.A., +1(415)492-9861.
15 */
16 
17 /* $Id: shcgen.c,v 1.4 2002/02/21 22:24:54 giles Exp $ */
18 /* Generate (bounded) Huffman code definitions from frequencies, */
19 /* and tables from definitions. */
20 #include "memory_.h"
21 #include "stdio_.h"
22 #include <stdlib.h>		/* for qsort */
23 #include "gdebug.h"
24 #include "gserror.h"
25 #include "gserrors.h"
26 #include "gsmemory.h"
27 #include "scommon.h"
28 #include "shc.h"
29 #include "shcgen.h"
30 
31 /* ------ Frequency -> definition procedure ------ */
32 
33 /* Define a node for the Huffman code tree. */
34 typedef struct count_node_s count_node;
35 struct count_node_s {
36     long freq;			/* frequency of value */
37     uint value;			/* data value being encoded */
38     uint code_length;		/* length of Huffman code */
39     count_node *next;		/* next node in freq-sorted list */
40     count_node *left;		/* left child in tree (smaller code_length) */
41     count_node *right;		/* right child in tree (greater code_length) */
42 };
43 
44 #ifdef DEBUG
45 #  define debug_print_nodes(nodes, n, tag, lengths)\
46      if ( gs_debug_c('W') ) print_nodes_proc(nodes, n, tag, lengths);
47 private void
print_nodes_proc(const count_node * nodes,int n,const char * tag,int lengths)48 print_nodes_proc(const count_node * nodes, int n, const char *tag, int lengths)
49 {
50     int i;
51 
52     dlprintf1("[w]---------------- %s ----------------\n", tag);
53     for (i = 0; i < n; ++i)
54 	dlprintf7("[w]node %d: f=%ld v=%d len=%d N=%d L=%d R=%d\n",
55 		  i, nodes[i].freq, nodes[i].value, nodes[i].code_length,
56 		  (nodes[i].next == 0 ? -1 : (int)(nodes[i].next - nodes)),
57 		  (nodes[i].left == 0 ? -1 : (int)(nodes[i].left - nodes)),
58 		(nodes[i].right == 0 ? -1 : (int)(nodes[i].right - nodes)));
59     for (i = lengths; i > 0;) {
60 	int j = i;
61 	int len = nodes[--j].code_length;
62 
63 	while (j > 0 && nodes[j - 1].code_length == len)
64 	    --j;
65 	dlprintf2("[w]%d codes of length %d\n", i - j, len);
66 	i = j;
67     }
68 }
69 #else
70 #  define debug_print_nodes(nodes, n, tag, lengths) DO_NOTHING
71 #endif
72 
73 /* Node comparison procedures for sorting. */
74 #define pn1 ((const count_node *)p1)
75 #define pn2 ((const count_node *)p2)
76 /* Sort by decreasing frequency. */
77 private int
compare_freqs(const void * p1,const void * p2)78 compare_freqs(const void *p1, const void *p2)
79 {
80     long diff = pn2->freq - pn1->freq;
81 
82     return (diff < 0 ? -1 : diff > 0 ? 1 : 0);
83 }
84 /* Sort by increasing code length, and secondarily by decreasing frequency. */
85 private int
compare_code_lengths(const void * p1,const void * p2)86 compare_code_lengths(const void *p1, const void *p2)
87 {
88     int diff = pn1->code_length - pn2->code_length;
89 
90     return (diff < 0 ? -1 : diff > 0 ? 1 : compare_freqs(p1, p2));
91 }
92 /* Sort by increasing code value. */
93 private int
compare_values(const void * p1,const void * p2)94 compare_values(const void *p1, const void *p2)
95 {
96     return (pn1->value < pn2->value ? -1 :
97 	    pn1->value > pn2->value ? 1 : 0);
98 }
99 #undef pn1
100 #undef pn2
101 
102 /* Adjust code lengths so that none of them exceeds max_length. */
103 /* We break this out just to help organize the code; it's only called */
104 /* from one place in hc_compute. */
105 private void
hc_limit_code_lengths(count_node * nodes,uint num_values,int max_length)106 hc_limit_code_lengths(count_node * nodes, uint num_values, int max_length)
107 {
108     int needed;			/* # of max_length codes we need to free up */
109     count_node *longest = nodes + num_values;
110 
111     {				/* Compute the number of additional max_length codes */
112 	/* we need to make available. */
113 	int length = longest[-1].code_length;
114 	int next_length;
115 	int avail = 0;
116 
117 	while ((next_length = longest[-1].code_length) > max_length) {
118 	    avail >>= length - next_length;
119 	    length = next_length;
120 	    (--longest)->code_length = max_length;
121 	    ++avail;
122 	}
123 	needed = (nodes + num_values - longest) -
124 	    (avail >>= (length - max_length));
125 	if_debug2('W', "[w]avail=%d, needed=%d\n",
126 		  avail, needed);
127     }
128     /* Skip over all max_length codes. */
129     while (longest[-1].code_length == max_length)
130 	--longest;
131 
132     /*
133      * To make available a code of length N, suppose that the next
134      * shortest used code is of length M.
135      * We take the lowest-frequency code of length M and change it
136      * to M+1; we then have to compensate by reducing the length of
137      * some of the highest-frequency codes of length N, as follows:
138      *       M      new lengths for codes of length N
139      *      ---     -----------
140      *      N-1     (none)
141      *      N-2     N-1
142      *      <N-2    M+2, M+2, N-1
143      * In the present situation, N = max_length.
144      */
145     for (; needed > 0; --needed) {	/* longest points to the first code of length max_length. */
146 	/* Since codes are sorted by increasing code length, */
147 	/* longest-1 is the desired code of length M. */
148 	int M1 = ++(longest[-1].code_length);
149 
150 	switch (max_length - M1) {
151 	    case 0:		/* M == N-1 */
152 		--longest;
153 		break;
154 	    case 1:		/* M == N-2 */
155 		longest++->code_length = M1;
156 		break;
157 	    default:
158 		longest->code_length = M1 + 1;
159 		longest[1].code_length = M1 + 1;
160 		longest[2].code_length--;
161 		longest += 3;
162 	}
163     }
164 }
165 
166 /* Compute an optimal Huffman code from an input data set. */
167 /* The client must have set all the elements of *def. */
168 int
hc_compute(hc_definition * def,const long * freqs,gs_memory_t * mem)169 hc_compute(hc_definition * def, const long *freqs, gs_memory_t * mem)
170 {
171     uint num_values = def->num_values;
172     count_node *nodes =
173     (count_node *) gs_alloc_byte_array(mem, num_values * 2 - 1,
174 				       sizeof(count_node), "hc_compute");
175     int i;
176     count_node *lowest;
177     count_node *comb;
178 
179     if (nodes == 0)
180 	return_error(gs_error_VMerror);
181 
182     /* Create leaf nodes for the input data. */
183     for (i = 0; i < num_values; ++i)
184 	nodes[i].freq = freqs[i], nodes[i].value = i;
185 
186     /* Create a list sorted by increasing frequency. */
187     /* Also initialize the tree structure. */
188     qsort(nodes, num_values, sizeof(count_node), compare_freqs);
189     for (i = 0; i < num_values; ++i)
190 	nodes[i].next = &nodes[i - 1],
191 	    nodes[i].code_length = 0,
192 	    nodes[i].left = nodes[i].right = 0;
193     nodes[0].next = 0;
194     debug_print_nodes(nodes, num_values, "after sort", 0);
195 
196     /* Construct the Huffman code tree. */
197     for (lowest = &nodes[num_values - 1], comb = &nodes[num_values];;
198 	 ++comb
199 	) {
200 	count_node *pn1 = lowest;
201 	count_node *pn2 = pn1->next;
202 	long freq = pn1->freq + pn2->freq;
203 
204 	/* Create a parent for the two lowest-frequency nodes. */
205 	lowest = pn2->next;
206 	comb->freq = freq;
207 	if (pn1->code_length <= pn2->code_length)
208 	    comb->left = pn1, comb->right = pn2,
209 		comb->code_length = pn2->code_length + 1;
210 	else
211 	    comb->left = pn2, comb->right = pn1,
212 		comb->code_length = pn1->code_length + 1;
213 	if (lowest == 0)	/* no nodes left to combine */
214 	    break;
215 	/* Insert comb in the sorted list. */
216 	if (freq < lowest->freq)
217 	    comb->next = lowest, lowest = comb;
218 	else {
219 	    count_node *here = lowest;
220 
221 	    while (here->next != 0 && freq >= here->next->freq)
222 		here = here->next;
223 	    comb->next = here->next;
224 	    here->next = comb;
225 	}
226     }
227 
228     /* comb (i.e., &nodes[num_values * 2 - 2] is the root of the tree. */
229     /* Note that the left and right children of an interior node */
230     /* were constructed before, and therefore have lower indices */
231     /* in the nodes array than, the parent node.  Thus we can assign */
232     /* the code lengths (node depths) in a single descending-order */
233     /* sweep. */
234     comb++->code_length = 0;
235     while (comb > nodes + num_values) {
236 	--comb;
237 	comb->left->code_length = comb->right->code_length =
238 	    comb->code_length + 1;
239     }
240     debug_print_nodes(nodes, num_values * 2 - 1, "after combine", 0);
241 
242     /* Sort the leaves again by code length. */
243     qsort(nodes, num_values, sizeof(count_node), compare_code_lengths);
244     debug_print_nodes(nodes, num_values, "after re-sort", num_values);
245 
246     /* Limit the code length to def->num_counts. */
247     hc_limit_code_lengths(nodes, num_values, def->num_counts);
248     debug_print_nodes(nodes, num_values, "after limit", num_values);
249 
250     /* Sort within each code length by increasing code value. */
251     /* This doesn't affect data compression, but it makes */
252     /* the code definition itself compress better using our */
253     /* incremental encoding. */
254     for (i = num_values; i > 0;) {
255 	int j = i;
256 	int len = nodes[--j].code_length;
257 
258 	while (j > 0 && nodes[j - 1].code_length == len)
259 	    --j;
260 	qsort(&nodes[j], i - j, sizeof(count_node), compare_values);
261 	i = j;
262     }
263 
264     /* Extract the definition from the nodes. */
265     memset(def->counts, 0, sizeof(*def->counts) * (def->num_counts + 1));
266     for (i = 0; i < num_values; ++i) {
267 	def->values[i] = nodes[i].value;
268 	def->counts[nodes[i].code_length]++;
269     }
270 
271     /* All done, release working storage. */
272     gs_free_object(mem, nodes, "hc_compute");
273     return 0;
274 }
275 
276 /* ------ Byte string <-> definition procedures ------ */
277 
278 /*
279  * We define a compressed representation for (well-behaved) definitions
280  * as a byte string.  A "well-behaved" definition is one where if
281  * code values A and B have the same code length and A < B,
282  * A precedes B in the values table of the definition, and hence
283  * A's encoding lexicographically precedes B's.
284  *
285  * The successive bytes in the compressed string  give the code lengths for
286  * runs of decoded values, in the form nnnnllll where nnnn is the number of
287  * consecutive values -1 and llll is the code length -1.
288  */
289 
290 /* Convert a definition to a byte string. */
291 /* The caller must provide the byte string, of length def->num_values. */
292 /* Assume (do not check) that the definition is well-behaved. */
293 /* Return the actual length of the string. */
294 int
hc_bytes_from_definition(byte * dbytes,const hc_definition * def)295 hc_bytes_from_definition(byte * dbytes, const hc_definition * def)
296 {
297     int i, j;
298     byte *bp = dbytes;
299     const byte *lp = dbytes;
300     const byte *end = dbytes + def->num_values;
301     const ushort *values = def->values;
302 
303     /* Temporarily use the output string as a map from */
304     /* values to code lengths. */
305     for (i = 1; i <= def->num_counts; i++)
306 	for (j = 0; j < def->counts[i]; j++)
307 	    bp[*values++] = i;
308 
309     /* Now construct the actual string. */
310     while (lp < end) {
311 	const byte *vp;
312 	byte len = *lp;
313 
314 	for (vp = lp + 1; vp < end && vp < lp + 16 && *vp == len;)
315 	    vp++;
316 	*bp++ = ((vp - lp - 1) << 4) + (len - 1);
317 	lp = vp;
318     }
319 
320     return bp - dbytes;
321 }
322 
323 /* Extract num_counts and num_values from a byte string. */
324 void
hc_sizes_from_bytes(hc_definition * def,const byte * dbytes,int num_bytes)325 hc_sizes_from_bytes(hc_definition * def, const byte * dbytes, int num_bytes)
326 {
327     uint num_counts = 0, num_values = 0;
328     int i;
329 
330     for (i = 0; i < num_bytes; i++) {
331 	int n = (dbytes[i] >> 4) + 1;
332 	int l = (dbytes[i] & 15) + 1;
333 
334 	if (l > num_counts)
335 	    num_counts = l;
336 	num_values += n;
337     }
338     def->num_counts = num_counts;
339     def->num_values = num_values;
340 }
341 
342 /* Convert a byte string back to a definition. */
343 /* The caller must initialize *def, including allocating counts and values. */
344 void
hc_definition_from_bytes(hc_definition * def,const byte * dbytes)345 hc_definition_from_bytes(hc_definition * def, const byte * dbytes)
346 {
347     int v, i;
348     ushort counts[max_hc_length + 1];
349 
350     /* Make a first pass to set the counts for each code length. */
351     memset(counts, 0, sizeof(counts[0]) * (def->num_counts + 1));
352     for (i = 0, v = 0; v < def->num_values; i++) {
353 	int n = (dbytes[i] >> 4) + 1;
354 	int l = (dbytes[i] & 15) + 1;
355 
356 	counts[l] += n;
357 	v += n;
358     }
359 
360     /* Now fill in the definition. */
361     memcpy(def->counts, counts, sizeof(counts[0]) * (def->num_counts + 1));
362     for (i = 1, v = 0; i <= def->num_counts; i++) {
363 	uint prev = counts[i];
364 
365 	counts[i] = v;
366 	v += prev;
367     }
368     for (i = 0, v = 0; v < def->num_values; i++) {
369 	int n = (dbytes[i] >> 4) + 1;
370 	int l = (dbytes[i] & 15) + 1;
371 	int j;
372 
373 	for (j = 0; j < n; n++)
374 	    def->values[counts[l]++] = v++;
375     }
376 }
377 
378 /* ------ Definition -> table procedures ------ */
379 
380 /* Generate the encoding table from the definition. */
381 /* The size of the encode array is def->num_values. */
382 void
hc_make_encoding(hce_code * encode,const hc_definition * def)383 hc_make_encoding(hce_code * encode, const hc_definition * def)
384 {
385     uint next = 0;
386     const ushort *pvalue = def->values;
387     uint i, k;
388 
389     for (i = 1; i <= def->num_counts; i++) {
390 	for (k = 0; k < def->counts[i]; k++, pvalue++, next++) {
391 	    hce_code *pce = encode + *pvalue;
392 
393 	    pce->code = next;
394 	    pce->code_length = i;
395 	}
396 	next <<= 1;
397     }
398 }
399 
400 /* We decode in two steps, first indexing into a table with */
401 /* a fixed number of bits from the source, and then indexing into */
402 /* an auxiliary table if necessary.  (See shc.h for details.) */
403 
404 /* Calculate the size of the decoding table. */
405 uint
hc_sizeof_decoding(const hc_definition * def,int initial_bits)406 hc_sizeof_decoding(const hc_definition * def, int initial_bits)
407 {
408     uint size = 1 << initial_bits;
409     uint carry = 0, mask = (uint) ~ 1;
410     uint i;
411 
412     for (i = initial_bits + 1; i <= def->num_counts;
413 	 i++, carry <<= 1, mask <<= 1
414 	) {
415 	carry += def->counts[i];
416 	size += carry & mask;
417 	carry &= ~mask;
418     }
419     return size;
420 }
421 
422 /* Generate the decoding tables. */
423 void
hc_make_decoding(hcd_code * decode,const hc_definition * def,int initial_bits)424 hc_make_decoding(hcd_code * decode, const hc_definition * def,
425 		 int initial_bits)
426 {				/* Make entries for single-dispatch codes. */
427     {
428 	hcd_code *pcd = decode;
429 	const ushort *pvalue = def->values;
430 	uint i, k, d;
431 
432 	for (i = 0; i <= initial_bits; i++) {
433 	    for (k = 0; k < def->counts[i]; k++, pvalue++) {
434 		for (d = 1 << (initial_bits - i); d > 0;
435 		     d--, pcd++
436 		    )
437 		    pcd->value = *pvalue,
438 			pcd->code_length = i;
439 	    }
440 	}
441     }
442     /* Make entries for two-dispatch codes. */
443     /* By working backward, we can do this more easily */
444     /* in a single pass. */
445     {
446 	uint dsize = hc_sizeof_decoding(def, initial_bits);
447 	hcd_code *pcd = decode + (1 << initial_bits);
448 	hcd_code *pcd2 = decode + dsize;
449 	const ushort *pvalue = def->values + def->num_values;
450 	uint entries_left = 0, slots_left = 0, mult_shift = 0;
451 	uint i = def->num_counts + 1, j;
452 
453 	for (;;) {
454 	    if (slots_left == 0) {
455 		if (entries_left != 0) {
456 		    slots_left = 1 << (i - initial_bits);
457 		    mult_shift = 0;
458 		    continue;
459 		}
460 		if (--i <= initial_bits)
461 		    break;
462 		entries_left = def->counts[i];
463 		continue;
464 	    }
465 	    if (entries_left == 0) {
466 		entries_left = def->counts[--i];
467 		mult_shift++;
468 		continue;
469 	    }
470 	    --entries_left, --pvalue;
471 	    for (j = 1 << mult_shift; j > 0; j--) {
472 		--pcd2;
473 		pcd2->value = *pvalue;
474 		pcd2->code_length = i - initial_bits;
475 	    }
476 	    if ((slots_left -= 1 << mult_shift) == 0) {
477 		--pcd;
478 		pcd->value = pcd2 - decode;
479 		pcd->code_length = i + mult_shift;
480 	    }
481 	}
482     }
483 }
484