/* gzjoin -- command to join gzip files into one gzip file

  Copyright (C) 2004, 2005, 2012 Mark Adler, all rights reserved
  version 1.2, 14 Aug 2012

  This software is provided 'as-is', without any express or implied
  warranty. In no event will the author be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.

  Mark Adler madler@alumni.caltech.edu
 */

/*
 * Change history:
 *
 * 1.0 11 Dec 2004 - First version
 * 1.1 12 Jun 2005 - Changed ssize_t to long for portability
 * 1.2 14 Aug 2012 - Clean up for z_const usage
 */

/*
   gzjoin takes one or more gzip files on the command line and writes out a
   single gzip file that will uncompress to the concatenation of the
   uncompressed data from the individual gzip files. gzjoin does this without
   having to recompress any of the data and without having to calculate a new
   crc32 for the concatenated uncompressed data. gzjoin does however have to
   decompress all of the input data in order to find the bits in the compressed
   data that need to be modified to concatenate the streams.

   gzjoin does not do an integrity check on the input gzip files other than
   checking the gzip header and decompressing the compressed data. They are
   otherwise assumed to be complete and correct.

   Each joint between gzip files removes at least 18 bytes of previous trailer
   and subsequent header, and inserts an average of about three bytes to the
   compressed data in order to connect the streams. The output gzip file
   has a minimal ten-byte gzip header with no file name or modification time.

   This program was written to illustrate the use of the Z_BLOCK option of
   inflate() and the crc32_combine() function. gzjoin will not compile with
   versions of zlib earlier than 1.2.3.
 */

56212397c6Schristos #include <stdio.h> /* fputs(), fprintf(), fwrite(), putc() */
57212397c6Schristos #include <stdlib.h> /* exit(), malloc(), free() */
58212397c6Schristos #include <fcntl.h> /* open() */
59212397c6Schristos #include <unistd.h> /* close(), read(), lseek() */
60212397c6Schristos #include "zlib.h"
61212397c6Schristos /* crc32(), crc32_combine(), inflateInit2(), inflate(), inflateEnd() */
62212397c6Schristos
63212397c6Schristos #define local static
64212397c6Schristos
65212397c6Schristos /* exit with an error (return a value to allow use in an expression) */
bail(char * why1,char * why2)66212397c6Schristos local int bail(char *why1, char *why2)
67212397c6Schristos {
68212397c6Schristos fprintf(stderr, "gzjoin error: %s%s, output incomplete\n", why1, why2);
69212397c6Schristos exit(1);
70212397c6Schristos return 0;
71212397c6Schristos }
72212397c6Schristos
/* -- simple buffered file input with access to the buffer -- */

/* Size of the input buffer in bytes. bskip() computes a remainder with
   "skip & (CHUNK - 1)", which is why this must be a power of two, and the
   value is used in unsigned arithmetic, so it must fit in unsigned. */
#define CHUNK 32768         /* must be a power of two and fit in unsigned */

/* bin buffered input file type */
typedef struct {
    char *name;             /* name of file for error messages */
    int fd;                 /* file descriptor */
    unsigned left;          /* bytes remaining at next */
    unsigned char *next;    /* next byte to read */
    unsigned char *buf;     /* allocated buffer of length CHUNK */
} bin;

86212397c6Schristos /* close a buffered file and free allocated memory */
bclose(bin * in)87212397c6Schristos local void bclose(bin *in)
88212397c6Schristos {
89212397c6Schristos if (in != NULL) {
90212397c6Schristos if (in->fd != -1)
91212397c6Schristos close(in->fd);
92212397c6Schristos if (in->buf != NULL)
93212397c6Schristos free(in->buf);
94212397c6Schristos free(in);
95212397c6Schristos }
96212397c6Schristos }
97212397c6Schristos
98212397c6Schristos /* open a buffered file for input, return a pointer to type bin, or NULL on
99212397c6Schristos failure */
bopen(char * name)100212397c6Schristos local bin *bopen(char *name)
101212397c6Schristos {
102212397c6Schristos bin *in;
103212397c6Schristos
104212397c6Schristos in = malloc(sizeof(bin));
105212397c6Schristos if (in == NULL)
106212397c6Schristos return NULL;
107212397c6Schristos in->buf = malloc(CHUNK);
108212397c6Schristos in->fd = open(name, O_RDONLY, 0);
109212397c6Schristos if (in->buf == NULL || in->fd == -1) {
110212397c6Schristos bclose(in);
111212397c6Schristos return NULL;
112212397c6Schristos }
113212397c6Schristos in->left = 0;
114212397c6Schristos in->next = in->buf;
115212397c6Schristos in->name = name;
116212397c6Schristos return in;
117212397c6Schristos }
118212397c6Schristos
119212397c6Schristos /* load buffer from file, return -1 on read error, 0 or 1 on success, with
120212397c6Schristos 1 indicating that end-of-file was reached */
bload(bin * in)121212397c6Schristos local int bload(bin *in)
122212397c6Schristos {
123212397c6Schristos long len;
124212397c6Schristos
125212397c6Schristos if (in == NULL)
126212397c6Schristos return -1;
127212397c6Schristos if (in->left != 0)
128212397c6Schristos return 0;
129212397c6Schristos in->next = in->buf;
130212397c6Schristos do {
131212397c6Schristos len = (long)read(in->fd, in->buf + in->left, CHUNK - in->left);
132212397c6Schristos if (len < 0)
133212397c6Schristos return -1;
134212397c6Schristos in->left += (unsigned)len;
135212397c6Schristos } while (len != 0 && in->left < CHUNK);
136212397c6Schristos return len == 0 ? 1 : 0;
137212397c6Schristos }
138212397c6Schristos
/* Get a byte from the file as an int in 0..255, refilling the buffer with
   bload() when it is empty. If no byte is available after the refill (end of
   file, or a read error reported by bload()), bail() is called and does not
   return. The argument is evaluated more than once, so it must not have side
   effects. */
#define bget(in) ((in)->left ? 0 : bload(in), \
                  (in)->left ? ((in)->left--, *((in)->next)++) : \
                      bail("unexpected end of file on ", (in)->name))

144212397c6Schristos /* get a four-byte little-endian unsigned integer from file */
bget4(bin * in)145212397c6Schristos local unsigned long bget4(bin *in)
146212397c6Schristos {
147212397c6Schristos unsigned long val;
148212397c6Schristos
149212397c6Schristos val = bget(in);
150212397c6Schristos val += (unsigned long)(bget(in)) << 8;
151212397c6Schristos val += (unsigned long)(bget(in)) << 16;
152212397c6Schristos val += (unsigned long)(bget(in)) << 24;
153212397c6Schristos return val;
154212397c6Schristos }
155212397c6Schristos
156212397c6Schristos /* skip bytes in file */
bskip(bin * in,unsigned skip)157212397c6Schristos local void bskip(bin *in, unsigned skip)
158212397c6Schristos {
159212397c6Schristos /* check pointer */
160212397c6Schristos if (in == NULL)
161212397c6Schristos return;
162212397c6Schristos
163212397c6Schristos /* easy case -- skip bytes in buffer */
164212397c6Schristos if (skip <= in->left) {
165212397c6Schristos in->left -= skip;
166212397c6Schristos in->next += skip;
167212397c6Schristos return;
168212397c6Schristos }
169212397c6Schristos
170212397c6Schristos /* skip what's in buffer, discard buffer contents */
171212397c6Schristos skip -= in->left;
172212397c6Schristos in->left = 0;
173212397c6Schristos
174212397c6Schristos /* seek past multiples of CHUNK bytes */
175212397c6Schristos if (skip > CHUNK) {
176212397c6Schristos unsigned left;
177212397c6Schristos
178212397c6Schristos left = skip & (CHUNK - 1);
179212397c6Schristos if (left == 0) {
180212397c6Schristos /* exact number of chunks: seek all the way minus one byte to check
181212397c6Schristos for end-of-file with a read */
182212397c6Schristos lseek(in->fd, skip - 1, SEEK_CUR);
183212397c6Schristos if (read(in->fd, in->buf, 1) != 1)
184212397c6Schristos bail("unexpected end of file on ", in->name);
185212397c6Schristos return;
186212397c6Schristos }
187212397c6Schristos
188212397c6Schristos /* skip the integral chunks, update skip with remainder */
189212397c6Schristos lseek(in->fd, skip - left, SEEK_CUR);
190212397c6Schristos skip = left;
191212397c6Schristos }
192212397c6Schristos
193212397c6Schristos /* read more input and skip remainder */
194212397c6Schristos bload(in);
195212397c6Schristos if (skip > in->left)
196212397c6Schristos bail("unexpected end of file on ", in->name);
197212397c6Schristos in->left -= skip;
198212397c6Schristos in->next += skip;
199212397c6Schristos }
200212397c6Schristos
201212397c6Schristos /* -- end of buffered input functions -- */
202212397c6Schristos
203212397c6Schristos /* skip the gzip header from file in */
gzhead(bin * in)204212397c6Schristos local void gzhead(bin *in)
205212397c6Schristos {
206212397c6Schristos int flags;
207212397c6Schristos
208212397c6Schristos /* verify gzip magic header and compression method */
209212397c6Schristos if (bget(in) != 0x1f || bget(in) != 0x8b || bget(in) != 8)
210212397c6Schristos bail(in->name, " is not a valid gzip file");
211212397c6Schristos
212212397c6Schristos /* get and verify flags */
213212397c6Schristos flags = bget(in);
214212397c6Schristos if ((flags & 0xe0) != 0)
215212397c6Schristos bail("unknown reserved bits set in ", in->name);
216212397c6Schristos
217212397c6Schristos /* skip modification time, extra flags, and os */
218212397c6Schristos bskip(in, 6);
219212397c6Schristos
220212397c6Schristos /* skip extra field if present */
221212397c6Schristos if (flags & 4) {
222212397c6Schristos unsigned len;
223212397c6Schristos
224212397c6Schristos len = bget(in);
225212397c6Schristos len += (unsigned)(bget(in)) << 8;
226212397c6Schristos bskip(in, len);
227212397c6Schristos }
228212397c6Schristos
229212397c6Schristos /* skip file name if present */
230212397c6Schristos if (flags & 8)
231212397c6Schristos while (bget(in) != 0)
232212397c6Schristos ;
233212397c6Schristos
234212397c6Schristos /* skip comment if present */
235212397c6Schristos if (flags & 16)
236212397c6Schristos while (bget(in) != 0)
237212397c6Schristos ;
238212397c6Schristos
239212397c6Schristos /* skip header crc if present */
240212397c6Schristos if (flags & 2)
241212397c6Schristos bskip(in, 2);
242212397c6Schristos }
243212397c6Schristos
/* Write the low 32 bits of val to out as four bytes in little-endian order
   (least significant byte first). Any higher bits of val are discarded. */
local void put4(unsigned long val, FILE *out)
{
    putc((int)(val & 0xff), out);
    putc((int)((val >> 8) & 0xff), out);
    putc((int)((val >> 16) & 0xff), out);
    putc((int)((val >> 24) & 0xff), out);
}

253212397c6Schristos /* Load up zlib stream from buffered input, bail if end of file */
zpull(z_streamp strm,bin * in)254212397c6Schristos local void zpull(z_streamp strm, bin *in)
255212397c6Schristos {
256212397c6Schristos if (in->left == 0)
257212397c6Schristos bload(in);
258212397c6Schristos if (in->left == 0)
259212397c6Schristos bail("unexpected end of file on ", in->name);
260212397c6Schristos strm->avail_in = in->left;
261212397c6Schristos strm->next_in = in->next;
262212397c6Schristos }
263212397c6Schristos
264212397c6Schristos /* Write header for gzip file to out and initialize trailer. */
gzinit(unsigned long * crc,unsigned long * tot,FILE * out)265212397c6Schristos local void gzinit(unsigned long *crc, unsigned long *tot, FILE *out)
266212397c6Schristos {
267212397c6Schristos fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out);
268212397c6Schristos *crc = crc32(0L, Z_NULL, 0);
269212397c6Schristos *tot = 0;
270212397c6Schristos }
271212397c6Schristos
272212397c6Schristos /* Copy the compressed data from name, zeroing the last block bit of the last
273212397c6Schristos block if clr is true, and adding empty blocks as needed to get to a byte
274212397c6Schristos boundary. If clr is false, then the last block becomes the last block of
275212397c6Schristos the output, and the gzip trailer is written. crc and tot maintains the
276212397c6Schristos crc and length (modulo 2^32) of the output for the trailer. The resulting
277212397c6Schristos gzip file is written to out. gzinit() must be called before the first call
278212397c6Schristos of gzcopy() to write the gzip header and to initialize crc and tot. */
gzcopy(char * name,int clr,unsigned long * crc,unsigned long * tot,FILE * out)279212397c6Schristos local void gzcopy(char *name, int clr, unsigned long *crc, unsigned long *tot,
280212397c6Schristos FILE *out)
281212397c6Schristos {
282212397c6Schristos int ret; /* return value from zlib functions */
283212397c6Schristos int pos; /* where the "last block" bit is in byte */
284212397c6Schristos int last; /* true if processing the last block */
285212397c6Schristos bin *in; /* buffered input file */
286212397c6Schristos unsigned char *start; /* start of compressed data in buffer */
287212397c6Schristos unsigned char *junk; /* buffer for uncompressed data -- discarded */
288212397c6Schristos z_off_t len; /* length of uncompressed data (support > 4 GB) */
289212397c6Schristos z_stream strm; /* zlib inflate stream */
290212397c6Schristos
291212397c6Schristos /* open gzip file and skip header */
292212397c6Schristos in = bopen(name);
293212397c6Schristos if (in == NULL)
294212397c6Schristos bail("could not open ", name);
295212397c6Schristos gzhead(in);
296212397c6Schristos
297212397c6Schristos /* allocate buffer for uncompressed data and initialize raw inflate
298212397c6Schristos stream */
299212397c6Schristos junk = malloc(CHUNK);
300212397c6Schristos strm.zalloc = Z_NULL;
301212397c6Schristos strm.zfree = Z_NULL;
302212397c6Schristos strm.opaque = Z_NULL;
303212397c6Schristos strm.avail_in = 0;
304212397c6Schristos strm.next_in = Z_NULL;
305212397c6Schristos ret = inflateInit2(&strm, -15);
306212397c6Schristos if (junk == NULL || ret != Z_OK)
307212397c6Schristos bail("out of memory", "");
308212397c6Schristos
309212397c6Schristos /* inflate and copy compressed data, clear last-block bit if requested */
310212397c6Schristos len = 0;
311212397c6Schristos zpull(&strm, in);
312*ba340e45Schristos start = in->next;
313212397c6Schristos last = start[0] & 1;
314212397c6Schristos if (last && clr)
315212397c6Schristos start[0] &= ~1;
316212397c6Schristos strm.avail_out = 0;
317212397c6Schristos for (;;) {
318212397c6Schristos /* if input used and output done, write used input and get more */
319212397c6Schristos if (strm.avail_in == 0 && strm.avail_out != 0) {
320212397c6Schristos fwrite(start, 1, strm.next_in - start, out);
321212397c6Schristos start = in->buf;
322212397c6Schristos in->left = 0;
323212397c6Schristos zpull(&strm, in);
324212397c6Schristos }
325212397c6Schristos
326212397c6Schristos /* decompress -- return early when end-of-block reached */
327212397c6Schristos strm.avail_out = CHUNK;
328212397c6Schristos strm.next_out = junk;
329212397c6Schristos ret = inflate(&strm, Z_BLOCK);
330212397c6Schristos switch (ret) {
331212397c6Schristos case Z_MEM_ERROR:
332212397c6Schristos bail("out of memory", "");
333212397c6Schristos case Z_DATA_ERROR:
334212397c6Schristos bail("invalid compressed data in ", in->name);
335212397c6Schristos }
336212397c6Schristos
337212397c6Schristos /* update length of uncompressed data */
338212397c6Schristos len += CHUNK - strm.avail_out;
339212397c6Schristos
340212397c6Schristos /* check for block boundary (only get this when block copied out) */
341212397c6Schristos if (strm.data_type & 128) {
342212397c6Schristos /* if that was the last block, then done */
343212397c6Schristos if (last)
344212397c6Schristos break;
345212397c6Schristos
346212397c6Schristos /* number of unused bits in last byte */
347212397c6Schristos pos = strm.data_type & 7;
348212397c6Schristos
349212397c6Schristos /* find the next last-block bit */
350212397c6Schristos if (pos != 0) {
351212397c6Schristos /* next last-block bit is in last used byte */
352212397c6Schristos pos = 0x100 >> pos;
353212397c6Schristos last = strm.next_in[-1] & pos;
354212397c6Schristos if (last && clr)
355*ba340e45Schristos in->buf[strm.next_in - in->buf - 1] &= ~pos;
356212397c6Schristos }
357212397c6Schristos else {
358212397c6Schristos /* next last-block bit is in next unused byte */
359212397c6Schristos if (strm.avail_in == 0) {
360212397c6Schristos /* don't have that byte yet -- get it */
361212397c6Schristos fwrite(start, 1, strm.next_in - start, out);
362212397c6Schristos start = in->buf;
363212397c6Schristos in->left = 0;
364212397c6Schristos zpull(&strm, in);
365212397c6Schristos }
366212397c6Schristos last = strm.next_in[0] & 1;
367212397c6Schristos if (last && clr)
368*ba340e45Schristos in->buf[strm.next_in - in->buf] &= ~1;
369212397c6Schristos }
370212397c6Schristos }
371212397c6Schristos }
372212397c6Schristos
373212397c6Schristos /* update buffer with unused input */
374212397c6Schristos in->left = strm.avail_in;
375*ba340e45Schristos in->next = in->buf + (strm.next_in - in->buf);
376212397c6Schristos
377212397c6Schristos /* copy used input, write empty blocks to get to byte boundary */
378212397c6Schristos pos = strm.data_type & 7;
379212397c6Schristos fwrite(start, 1, in->next - start - 1, out);
380212397c6Schristos last = in->next[-1];
381212397c6Schristos if (pos == 0 || !clr)
382212397c6Schristos /* already at byte boundary, or last file: write last byte */
383212397c6Schristos putc(last, out);
384212397c6Schristos else {
385212397c6Schristos /* append empty blocks to last byte */
386212397c6Schristos last &= ((0x100 >> pos) - 1); /* assure unused bits are zero */
387212397c6Schristos if (pos & 1) {
388212397c6Schristos /* odd -- append an empty stored block */
389212397c6Schristos putc(last, out);
390212397c6Schristos if (pos == 1)
391212397c6Schristos putc(0, out); /* two more bits in block header */
392212397c6Schristos fwrite("\0\0\xff\xff", 1, 4, out);
393212397c6Schristos }
394212397c6Schristos else {
395212397c6Schristos /* even -- append 1, 2, or 3 empty fixed blocks */
396212397c6Schristos switch (pos) {
397212397c6Schristos case 6:
398212397c6Schristos putc(last | 8, out);
399212397c6Schristos last = 0;
400212397c6Schristos case 4:
401212397c6Schristos putc(last | 0x20, out);
402212397c6Schristos last = 0;
403212397c6Schristos case 2:
404212397c6Schristos putc(last | 0x80, out);
405212397c6Schristos putc(0, out);
406212397c6Schristos }
407212397c6Schristos }
408212397c6Schristos }
409212397c6Schristos
410212397c6Schristos /* update crc and tot */
411212397c6Schristos *crc = crc32_combine(*crc, bget4(in), len);
412212397c6Schristos *tot += (unsigned long)len;
413212397c6Schristos
414212397c6Schristos /* clean up */
415212397c6Schristos inflateEnd(&strm);
416212397c6Schristos free(junk);
417212397c6Schristos bclose(in);
418212397c6Schristos
419212397c6Schristos /* write trailer if this is the last gzip file */
420212397c6Schristos if (!clr) {
421212397c6Schristos put4(*crc, out);
422212397c6Schristos put4(*tot, out);
423212397c6Schristos }
424212397c6Schristos }
425212397c6Schristos
/* Join the gzip files named on the command line and write the result to
   stdout. With no arguments a usage line is printed on stderr and 0 is
   returned. Any error ends the program with status 1 through bail(),
   including a failure to write the output: the writes made while copying are
   not individually checked, so the stream is flushed and its error indicator
   tested once at the end. */
int main(int argc, char **argv)
{
    unsigned long crc, tot;     /* running crc and total uncompressed length */

    /* skip command name */
    argc--;
    argv++;

    /* show usage if no arguments */
    if (argc == 0) {
        fputs("gzjoin usage: gzjoin f1.gz [f2.gz [f3.gz ...]] > fjoin.gz\n",
              stderr);
        return 0;
    }

    /* join gzip files on command line and write to stdout -- the decremented
       argc is the number of files still to come, so it is zero (clr false)
       only for the last file */
    gzinit(&crc, &tot, stdout);
    while (argc--)
        gzcopy(*argv++, argc, &crc, &tot, stdout);

    /* make sure all of the output actually got written */
    if (fflush(stdout) != 0 || ferror(stdout))
        bail("write error on ", "standard output");

    /* done */
    return 0;
}