1*86d7f5d3SJohn Marino /*-
2*86d7f5d3SJohn Marino * Copyright (c) 2009 Xin LI <delphij@FreeBSD.org>
3*86d7f5d3SJohn Marino * All rights reserved.
4*86d7f5d3SJohn Marino *
5*86d7f5d3SJohn Marino * Redistribution and use in source and binary forms, with or without
6*86d7f5d3SJohn Marino * modification, are permitted provided that the following conditions
7*86d7f5d3SJohn Marino * are met:
8*86d7f5d3SJohn Marino * 1. Redistributions of source code must retain the above copyright
9*86d7f5d3SJohn Marino * notice, this list of conditions and the following disclaimer.
10*86d7f5d3SJohn Marino * 2. Redistributions in binary form must reproduce the above copyright
11*86d7f5d3SJohn Marino * notice, this list of conditions and the following disclaimer in the
12*86d7f5d3SJohn Marino * documentation and/or other materials provided with the distribution.
13*86d7f5d3SJohn Marino *
14*86d7f5d3SJohn Marino * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15*86d7f5d3SJohn Marino * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16*86d7f5d3SJohn Marino * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17*86d7f5d3SJohn Marino * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18*86d7f5d3SJohn Marino * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19*86d7f5d3SJohn Marino * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20*86d7f5d3SJohn Marino * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21*86d7f5d3SJohn Marino * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22*86d7f5d3SJohn Marino * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23*86d7f5d3SJohn Marino * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24*86d7f5d3SJohn Marino * SUCH DAMAGE.
25*86d7f5d3SJohn Marino *
26*86d7f5d3SJohn Marino * $FreeBSD: head/usr.bin/gzip/unpack.c 194579 2009-06-21 09:39:43Z delphij $
27*86d7f5d3SJohn Marino */
28*86d7f5d3SJohn Marino
29*86d7f5d3SJohn Marino /* This file is #included by gzip.c */
30*86d7f5d3SJohn Marino
31*86d7f5d3SJohn Marino /*
32*86d7f5d3SJohn Marino * pack(1) file format:
33*86d7f5d3SJohn Marino *
34*86d7f5d3SJohn Marino * The first 7 bytes is the header:
35*86d7f5d3SJohn Marino * 00, 01 - Signature (US, RS), we already validated it earlier.
36*86d7f5d3SJohn Marino * 02..05 - Uncompressed size
37*86d7f5d3SJohn Marino * 06 - Level for the huffman tree (<=24)
38*86d7f5d3SJohn Marino *
39*86d7f5d3SJohn Marino * pack(1) will then store symbols (leaf) nodes count in each huffman
40*86d7f5d3SJohn Marino * tree levels, each level would consume 1 byte (See [1]).
41*86d7f5d3SJohn Marino *
42*86d7f5d3SJohn Marino * After the symbol count table, there is the symbol table, storing
43*86d7f5d3SJohn Marino * symbols represented by coresponding leaf node. EOB is not being
44*86d7f5d3SJohn Marino * explicitly transmitted (not necessary anyway) in the symbol table.
45*86d7f5d3SJohn Marino *
46*86d7f5d3SJohn Marino * Compressed data goes after the symbol table.
47*86d7f5d3SJohn Marino *
48*86d7f5d3SJohn Marino * NOTES
49*86d7f5d3SJohn Marino *
50*86d7f5d3SJohn Marino * [1] If we count EOB into the symbols, that would mean that we will
51*86d7f5d3SJohn Marino * have at most 256 symbols in the huffman tree. pack(1) rejects empty
52*86d7f5d3SJohn Marino * file and files that just repeats one character, which means that we
53*86d7f5d3SJohn Marino * will have at least 2 symbols. Therefore, pack(1) would reduce the
54*86d7f5d3SJohn Marino * last level symbol count by 2 which makes it a number in
55*86d7f5d3SJohn Marino * range [0..254], so all levels' symbol count would fit into 1 byte.
56*86d7f5d3SJohn Marino */
57*86d7f5d3SJohn Marino
58*86d7f5d3SJohn Marino #define PACK_HEADER_LENGTH 7
59*86d7f5d3SJohn Marino #define HTREE_MAXLEVEL 24
60*86d7f5d3SJohn Marino
61*86d7f5d3SJohn Marino /*
62*86d7f5d3SJohn Marino * unpack descriptor
63*86d7f5d3SJohn Marino *
64*86d7f5d3SJohn Marino * Represent the huffman tree in a similiar way that pack(1) would
65*86d7f5d3SJohn Marino * store in a packed file. We store all symbols in a linear table,
66*86d7f5d3SJohn Marino * and store pointers to each level's first symbol. In addition to
67*86d7f5d3SJohn Marino * that, maintain two counts for each level: inner nodes count and
68*86d7f5d3SJohn Marino * leaf nodes count.
69*86d7f5d3SJohn Marino */
70*86d7f5d3SJohn Marino typedef struct {
71*86d7f5d3SJohn Marino int symbol_size; /* Size of the symbol table */
72*86d7f5d3SJohn Marino int treelevels; /* Levels for the huffman tree */
73*86d7f5d3SJohn Marino
74*86d7f5d3SJohn Marino int *symbolsin; /* Table of leaf symbols count in
75*86d7f5d3SJohn Marino each level */
76*86d7f5d3SJohn Marino int *inodesin; /* Table of internal nodes count in
77*86d7f5d3SJohn Marino each level */
78*86d7f5d3SJohn Marino
79*86d7f5d3SJohn Marino char *symbol; /* The symbol table */
80*86d7f5d3SJohn Marino char *symbol_eob; /* Pointer to the EOB symbol */
81*86d7f5d3SJohn Marino char **tree; /* Decoding huffman tree (pointers to
82*86d7f5d3SJohn Marino first symbol of each tree level */
83*86d7f5d3SJohn Marino
84*86d7f5d3SJohn Marino off_t uncompressed_size; /* Uncompressed size */
85*86d7f5d3SJohn Marino FILE *fpIn; /* Input stream */
86*86d7f5d3SJohn Marino FILE *fpOut; /* Output stream */
87*86d7f5d3SJohn Marino } unpack_descriptor_t;
88*86d7f5d3SJohn Marino
89*86d7f5d3SJohn Marino /*
90*86d7f5d3SJohn Marino * Release resource allocated to an unpack descriptor.
91*86d7f5d3SJohn Marino *
92*86d7f5d3SJohn Marino * Caller is responsible to make sure that all of these pointers are
93*86d7f5d3SJohn Marino * initialized (in our case, they all point to valid memory block).
94*86d7f5d3SJohn Marino * We don't zero out pointers here because nobody else would ever
95*86d7f5d3SJohn Marino * reference the memory block without scrubing them.
96*86d7f5d3SJohn Marino */
97*86d7f5d3SJohn Marino static void
unpack_descriptor_fini(unpack_descriptor_t * unpackd)98*86d7f5d3SJohn Marino unpack_descriptor_fini(unpack_descriptor_t *unpackd)
99*86d7f5d3SJohn Marino {
100*86d7f5d3SJohn Marino
101*86d7f5d3SJohn Marino free(unpackd->symbolsin);
102*86d7f5d3SJohn Marino free(unpackd->inodesin);
103*86d7f5d3SJohn Marino free(unpackd->symbol);
104*86d7f5d3SJohn Marino free(unpackd->tree);
105*86d7f5d3SJohn Marino
106*86d7f5d3SJohn Marino fclose(unpackd->fpIn);
107*86d7f5d3SJohn Marino fclose(unpackd->fpOut);
108*86d7f5d3SJohn Marino }
109*86d7f5d3SJohn Marino
110*86d7f5d3SJohn Marino /*
111*86d7f5d3SJohn Marino * Recursively fill the internal node count table
112*86d7f5d3SJohn Marino */
113*86d7f5d3SJohn Marino static void
unpackd_fill_inodesin(const unpack_descriptor_t * unpackd,int level)114*86d7f5d3SJohn Marino unpackd_fill_inodesin(const unpack_descriptor_t *unpackd, int level)
115*86d7f5d3SJohn Marino {
116*86d7f5d3SJohn Marino
117*86d7f5d3SJohn Marino /*
118*86d7f5d3SJohn Marino * The internal nodes would be 1/2 of total internal nodes and
119*86d7f5d3SJohn Marino * leaf nodes in the next level. For the last level there
120*86d7f5d3SJohn Marino * would be no internal node by defination.
121*86d7f5d3SJohn Marino */
122*86d7f5d3SJohn Marino if (level < unpackd->treelevels) {
123*86d7f5d3SJohn Marino unpackd_fill_inodesin(unpackd, level + 1);
124*86d7f5d3SJohn Marino unpackd->inodesin[level] = (unpackd->inodesin[level + 1] +
125*86d7f5d3SJohn Marino unpackd->symbolsin[level + 1]) / 2;
126*86d7f5d3SJohn Marino } else
127*86d7f5d3SJohn Marino unpackd->inodesin[level] = 0;
128*86d7f5d3SJohn Marino }
129*86d7f5d3SJohn Marino
130*86d7f5d3SJohn Marino /*
131*86d7f5d3SJohn Marino * Update counter for accepted bytes
132*86d7f5d3SJohn Marino */
133*86d7f5d3SJohn Marino static void
accepted_bytes(off_t * bytes_in,off_t newbytes)134*86d7f5d3SJohn Marino accepted_bytes(off_t *bytes_in, off_t newbytes)
135*86d7f5d3SJohn Marino {
136*86d7f5d3SJohn Marino
137*86d7f5d3SJohn Marino if (bytes_in != NULL)
138*86d7f5d3SJohn Marino (*bytes_in) += newbytes;
139*86d7f5d3SJohn Marino }
140*86d7f5d3SJohn Marino
141*86d7f5d3SJohn Marino /*
142*86d7f5d3SJohn Marino * Read file header and construct the tree. Also, prepare the buffered I/O
143*86d7f5d3SJohn Marino * for decode rountine.
144*86d7f5d3SJohn Marino *
145*86d7f5d3SJohn Marino * Return value is uncompressed size.
146*86d7f5d3SJohn Marino */
147*86d7f5d3SJohn Marino static void
unpack_parse_header(int in,int out,char * pre,size_t prelen,off_t * bytes_in,unpack_descriptor_t * unpackd)148*86d7f5d3SJohn Marino unpack_parse_header(int in, int out, char *pre, size_t prelen, off_t *bytes_in,
149*86d7f5d3SJohn Marino unpack_descriptor_t *unpackd)
150*86d7f5d3SJohn Marino {
151*86d7f5d3SJohn Marino unsigned char hdr[PACK_HEADER_LENGTH]; /* buffer for header */
152*86d7f5d3SJohn Marino ssize_t bytesread; /* Bytes read from the file */
153*86d7f5d3SJohn Marino int i, j, thisbyte;
154*86d7f5d3SJohn Marino
155*86d7f5d3SJohn Marino /* Prepend the header buffer if we already read some data */
156*86d7f5d3SJohn Marino if (prelen != 0)
157*86d7f5d3SJohn Marino memcpy(hdr, pre, prelen);
158*86d7f5d3SJohn Marino
159*86d7f5d3SJohn Marino /* Read in and fill the rest bytes of header */
160*86d7f5d3SJohn Marino bytesread = read(in, hdr + prelen, PACK_HEADER_LENGTH - prelen);
161*86d7f5d3SJohn Marino if (bytesread < 0)
162*86d7f5d3SJohn Marino maybe_err("Error reading pack header");
163*86d7f5d3SJohn Marino
164*86d7f5d3SJohn Marino accepted_bytes(bytes_in, PACK_HEADER_LENGTH);
165*86d7f5d3SJohn Marino
166*86d7f5d3SJohn Marino /* Obtain uncompressed length (bytes 2,3,4,5)*/
167*86d7f5d3SJohn Marino unpackd->uncompressed_size = 0;
168*86d7f5d3SJohn Marino for (i = 2; i <= 5; i++) {
169*86d7f5d3SJohn Marino unpackd->uncompressed_size <<= 8;
170*86d7f5d3SJohn Marino unpackd->uncompressed_size |= hdr[i];
171*86d7f5d3SJohn Marino }
172*86d7f5d3SJohn Marino
173*86d7f5d3SJohn Marino /* Get the levels of the tree */
174*86d7f5d3SJohn Marino unpackd->treelevels = hdr[6];
175*86d7f5d3SJohn Marino if (unpackd->treelevels > HTREE_MAXLEVEL || unpackd->treelevels < 1)
176*86d7f5d3SJohn Marino maybe_errx("Huffman tree has insane levels");
177*86d7f5d3SJohn Marino
178*86d7f5d3SJohn Marino /* Let libc take care for buffering from now on */
179*86d7f5d3SJohn Marino if ((unpackd->fpIn = fdopen(in, "r")) == NULL)
180*86d7f5d3SJohn Marino maybe_err("Can not fdopen() input stream");
181*86d7f5d3SJohn Marino if ((unpackd->fpOut = fdopen(out, "w")) == NULL)
182*86d7f5d3SJohn Marino maybe_err("Can not fdopen() output stream");
183*86d7f5d3SJohn Marino
184*86d7f5d3SJohn Marino /* Allocate for the tables of bounds and the tree itself */
185*86d7f5d3SJohn Marino unpackd->inodesin =
186*86d7f5d3SJohn Marino calloc(unpackd->treelevels, sizeof(*(unpackd->inodesin)));
187*86d7f5d3SJohn Marino unpackd->symbolsin =
188*86d7f5d3SJohn Marino calloc(unpackd->treelevels, sizeof(*(unpackd->symbolsin)));
189*86d7f5d3SJohn Marino unpackd->tree =
190*86d7f5d3SJohn Marino calloc(unpackd->treelevels, (sizeof (*(unpackd->tree))));
191*86d7f5d3SJohn Marino if (unpackd->inodesin == NULL || unpackd->symbolsin == NULL ||
192*86d7f5d3SJohn Marino unpackd->tree == NULL)
193*86d7f5d3SJohn Marino maybe_err("calloc");
194*86d7f5d3SJohn Marino
195*86d7f5d3SJohn Marino /* We count from 0 so adjust to match array upper bound */
196*86d7f5d3SJohn Marino unpackd->treelevels--;
197*86d7f5d3SJohn Marino
198*86d7f5d3SJohn Marino /* Read the levels symbol count table and caculate total */
199*86d7f5d3SJohn Marino unpackd->symbol_size = 1; /* EOB */
200*86d7f5d3SJohn Marino for (i = 0; i <= unpackd->treelevels; i++) {
201*86d7f5d3SJohn Marino if ((thisbyte = fgetc(unpackd->fpIn)) == EOF)
202*86d7f5d3SJohn Marino maybe_err("File appears to be truncated");
203*86d7f5d3SJohn Marino unpackd->symbolsin[i] = (unsigned char)thisbyte;
204*86d7f5d3SJohn Marino unpackd->symbol_size += unpackd->symbolsin[i];
205*86d7f5d3SJohn Marino }
206*86d7f5d3SJohn Marino accepted_bytes(bytes_in, unpackd->treelevels);
207*86d7f5d3SJohn Marino if (unpackd->symbol_size > 256)
208*86d7f5d3SJohn Marino maybe_errx("Bad symbol table");
209*86d7f5d3SJohn Marino
210*86d7f5d3SJohn Marino /* Allocate for the symbol table, point symbol_eob at the beginning */
211*86d7f5d3SJohn Marino unpackd->symbol_eob = unpackd->symbol = calloc(1, unpackd->symbol_size);
212*86d7f5d3SJohn Marino if (unpackd->symbol == NULL)
213*86d7f5d3SJohn Marino maybe_err("calloc");
214*86d7f5d3SJohn Marino
215*86d7f5d3SJohn Marino /*
216*86d7f5d3SJohn Marino * Read in the symbol table, which contain [2, 256] symbols.
217*86d7f5d3SJohn Marino * In order to fit the count in one byte, pack(1) would offset
218*86d7f5d3SJohn Marino * it by reducing 2 from the actual number from the last level.
219*86d7f5d3SJohn Marino *
220*86d7f5d3SJohn Marino * We adjust the last level's symbol count by 1 here, because
221*86d7f5d3SJohn Marino * the EOB symbol is not being transmitted explicitly. Another
222*86d7f5d3SJohn Marino * adjustment would be done later afterward.
223*86d7f5d3SJohn Marino */
224*86d7f5d3SJohn Marino unpackd->symbolsin[unpackd->treelevels]++;
225*86d7f5d3SJohn Marino for (i = 0; i <= unpackd->treelevels; i++) {
226*86d7f5d3SJohn Marino unpackd->tree[i] = unpackd->symbol_eob;
227*86d7f5d3SJohn Marino for (j = 0; j < unpackd->symbolsin[i]; j++) {
228*86d7f5d3SJohn Marino if ((thisbyte = fgetc(unpackd->fpIn)) == EOF)
229*86d7f5d3SJohn Marino maybe_errx("Symbol table truncated");
230*86d7f5d3SJohn Marino *unpackd->symbol_eob++ = (char)thisbyte;
231*86d7f5d3SJohn Marino }
232*86d7f5d3SJohn Marino accepted_bytes(bytes_in, unpackd->symbolsin[i]);
233*86d7f5d3SJohn Marino }
234*86d7f5d3SJohn Marino
235*86d7f5d3SJohn Marino /* Now, take account for the EOB symbol as well */
236*86d7f5d3SJohn Marino unpackd->symbolsin[unpackd->treelevels]++;
237*86d7f5d3SJohn Marino
238*86d7f5d3SJohn Marino /*
239*86d7f5d3SJohn Marino * The symbolsin table has been constructed now.
240*86d7f5d3SJohn Marino * Caculate the internal nodes count table based on it.
241*86d7f5d3SJohn Marino */
242*86d7f5d3SJohn Marino unpackd_fill_inodesin(unpackd, 0);
243*86d7f5d3SJohn Marino }
244*86d7f5d3SJohn Marino
245*86d7f5d3SJohn Marino /*
246*86d7f5d3SJohn Marino * Decode huffman stream, based on the huffman tree.
247*86d7f5d3SJohn Marino */
248*86d7f5d3SJohn Marino static void
unpack_decode(const unpack_descriptor_t * unpackd,off_t * bytes_in)249*86d7f5d3SJohn Marino unpack_decode(const unpack_descriptor_t *unpackd, off_t *bytes_in)
250*86d7f5d3SJohn Marino {
251*86d7f5d3SJohn Marino int thislevel, thiscode, thisbyte, inlevelindex;
252*86d7f5d3SJohn Marino int i;
253*86d7f5d3SJohn Marino off_t bytes_out = 0;
254*86d7f5d3SJohn Marino const char *thissymbol; /* The symbol pointer decoded from stream */
255*86d7f5d3SJohn Marino
256*86d7f5d3SJohn Marino /*
257*86d7f5d3SJohn Marino * Decode huffman. Fetch every bytes from the file, get it
258*86d7f5d3SJohn Marino * into 'thiscode' bit-by-bit, then output the symbol we got
259*86d7f5d3SJohn Marino * when one has been found.
260*86d7f5d3SJohn Marino *
261*86d7f5d3SJohn Marino * Assumption: sizeof(int) > ((max tree levels + 1) / 8).
262*86d7f5d3SJohn Marino * bad things could happen if not.
263*86d7f5d3SJohn Marino */
264*86d7f5d3SJohn Marino thislevel = 0;
265*86d7f5d3SJohn Marino thiscode = thisbyte = 0;
266*86d7f5d3SJohn Marino
267*86d7f5d3SJohn Marino while ((thisbyte = fgetc(unpackd->fpIn)) != EOF) {
268*86d7f5d3SJohn Marino accepted_bytes(bytes_in, 1);
269*86d7f5d3SJohn Marino
270*86d7f5d3SJohn Marino /*
271*86d7f5d3SJohn Marino * Split one bit from thisbyte, from highest to lowest,
272*86d7f5d3SJohn Marino * feed the bit into thiscode, until we got a symbol from
273*86d7f5d3SJohn Marino * the tree.
274*86d7f5d3SJohn Marino */
275*86d7f5d3SJohn Marino for (i = 7; i >= 0; i--) {
276*86d7f5d3SJohn Marino thiscode = (thiscode << 1) | ((thisbyte >> i) & 1);
277*86d7f5d3SJohn Marino
278*86d7f5d3SJohn Marino /* Did we got a symbol? (referencing leaf node) */
279*86d7f5d3SJohn Marino if (thiscode >= unpackd->inodesin[thislevel]) {
280*86d7f5d3SJohn Marino inlevelindex =
281*86d7f5d3SJohn Marino thiscode - unpackd->inodesin[thislevel];
282*86d7f5d3SJohn Marino if (inlevelindex > unpackd->symbolsin[thislevel])
283*86d7f5d3SJohn Marino maybe_errx("File corrupt");
284*86d7f5d3SJohn Marino
285*86d7f5d3SJohn Marino thissymbol =
286*86d7f5d3SJohn Marino &(unpackd->tree[thislevel][inlevelindex]);
287*86d7f5d3SJohn Marino if ((thissymbol == unpackd->symbol_eob) &&
288*86d7f5d3SJohn Marino (bytes_out == unpackd->uncompressed_size))
289*86d7f5d3SJohn Marino goto finished;
290*86d7f5d3SJohn Marino
291*86d7f5d3SJohn Marino fputc((*thissymbol), unpackd->fpOut);
292*86d7f5d3SJohn Marino bytes_out++;
293*86d7f5d3SJohn Marino
294*86d7f5d3SJohn Marino /* Prepare for next input */
295*86d7f5d3SJohn Marino thislevel = 0; thiscode = 0;
296*86d7f5d3SJohn Marino } else {
297*86d7f5d3SJohn Marino thislevel++;
298*86d7f5d3SJohn Marino if (thislevel > unpackd->treelevels)
299*86d7f5d3SJohn Marino maybe_errx("File corrupt");
300*86d7f5d3SJohn Marino }
301*86d7f5d3SJohn Marino }
302*86d7f5d3SJohn Marino }
303*86d7f5d3SJohn Marino
304*86d7f5d3SJohn Marino finished:
305*86d7f5d3SJohn Marino if (bytes_out != unpackd->uncompressed_size)
306*86d7f5d3SJohn Marino maybe_errx("Premature EOF");
307*86d7f5d3SJohn Marino }
308*86d7f5d3SJohn Marino
309*86d7f5d3SJohn Marino /* Handler for pack(1)'ed file */
310*86d7f5d3SJohn Marino static off_t
unpack(int in,int out,char * pre,size_t prelen,off_t * bytes_in)311*86d7f5d3SJohn Marino unpack(int in, int out, char *pre, size_t prelen, off_t *bytes_in)
312*86d7f5d3SJohn Marino {
313*86d7f5d3SJohn Marino unpack_descriptor_t unpackd;
314*86d7f5d3SJohn Marino
315*86d7f5d3SJohn Marino unpack_parse_header(dup(in), dup(out), pre, prelen, bytes_in, &unpackd);
316*86d7f5d3SJohn Marino unpack_decode(&unpackd, bytes_in);
317*86d7f5d3SJohn Marino unpack_descriptor_fini(&unpackd);
318*86d7f5d3SJohn Marino
319*86d7f5d3SJohn Marino /* If we reached here, the unpack was successful */
320*86d7f5d3SJohn Marino return (unpackd.uncompressed_size);
321*86d7f5d3SJohn Marino }
322