/* gzjoin -- command to join gzip files into one gzip file

  Copyright (C) 2004, 2005, 2012 Mark Adler, all rights reserved
  version 1.2, 14 Aug 2012

  This software is provided 'as-is', without any express or implied
  warranty. In no event will the author be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.

  Mark Adler    madler@alumni.caltech.edu
 */

/*
 * Change history:
 *
 * 1.0  11 Dec 2004     - First version
 * 1.1  12 Jun 2005     - Changed ssize_t to long for portability
 * 1.2  14 Aug 2012     - Clean up for z_const usage
 */

/*
   gzjoin takes one or more gzip files on the command line and writes out a
   single gzip file that will uncompress to the concatenation of the
   uncompressed data from the individual gzip files. gzjoin does this without
   having to recompress any of the data and without having to calculate a new
   crc32 for the concatenated uncompressed data. gzjoin does however have to
   decompress all of the input data in order to find the bits in the compressed
   data that need to be modified to concatenate the streams.

   gzjoin does not do an integrity check on the input gzip files other than
   checking the gzip header and decompressing the compressed data. They are
   otherwise assumed to be complete and correct.

   Each joint between gzip files removes at least 18 bytes of previous trailer
   and subsequent header, and inserts an average of about three bytes to the
   compressed data in order to connect the streams. The output gzip file
   has a minimal ten-byte gzip header with no file name or modification time.

   This program was written to illustrate the use of the Z_BLOCK option of
   inflate() and the crc32_combine() function. gzjoin will not compile with
   versions of zlib earlier than 1.2.3.
 */

#include <stdio.h>      /* fputs(), fprintf(), fwrite(), putc() */
#include <stdlib.h>     /* exit(), malloc(), free() */
#include <sys/types.h>  /* off_t */
#include <fcntl.h>      /* open() */
#include <unistd.h>     /* close(), read(), lseek() */
#include "zlib.h"
    /* crc32(), crc32_combine(), inflateInit2(), inflate(), inflateEnd() */

/* "local" marks functions that are private to this file */
#define local static

659573673dSchristos /* exit with an error (return a value to allow use in an expression) */
bail(char * why1,char * why2)669573673dSchristos local int bail(char *why1, char *why2)
679573673dSchristos {
689573673dSchristos fprintf(stderr, "gzjoin error: %s%s, output incomplete\n", why1, why2);
699573673dSchristos exit(1);
709573673dSchristos return 0;
719573673dSchristos }
729573673dSchristos
/* -- simple buffered file input with access to the buffer -- */

/* Size in bytes of the input buffer and of the discard buffer for inflated
   data.  bskip() masks with (CHUNK - 1), hence the power-of-two requirement. */
#define CHUNK 32768         /* must be a power of two and fit in unsigned */

/* bin buffered input file type: a file descriptor plus a CHUNK-byte buffer
   whose unread portion is the "left" bytes starting at "next" */
typedef struct {
    char *name;             /* name of file for error messages */
    int fd;                 /* file descriptor */
    unsigned left;          /* bytes remaining at next */
    unsigned char *next;    /* next byte to read */
    unsigned char *buf;     /* allocated buffer of length CHUNK */
} bin;

869573673dSchristos /* close a buffered file and free allocated memory */
bclose(bin * in)879573673dSchristos local void bclose(bin *in)
889573673dSchristos {
899573673dSchristos if (in != NULL) {
909573673dSchristos if (in->fd != -1)
919573673dSchristos close(in->fd);
929573673dSchristos if (in->buf != NULL)
939573673dSchristos free(in->buf);
949573673dSchristos free(in);
959573673dSchristos }
969573673dSchristos }
979573673dSchristos
989573673dSchristos /* open a buffered file for input, return a pointer to type bin, or NULL on
999573673dSchristos failure */
bopen(char * name)1009573673dSchristos local bin *bopen(char *name)
1019573673dSchristos {
1029573673dSchristos bin *in;
1039573673dSchristos
1049573673dSchristos in = malloc(sizeof(bin));
1059573673dSchristos if (in == NULL)
1069573673dSchristos return NULL;
1079573673dSchristos in->buf = malloc(CHUNK);
1089573673dSchristos in->fd = open(name, O_RDONLY, 0);
1099573673dSchristos if (in->buf == NULL || in->fd == -1) {
1109573673dSchristos bclose(in);
1119573673dSchristos return NULL;
1129573673dSchristos }
1139573673dSchristos in->left = 0;
1149573673dSchristos in->next = in->buf;
1159573673dSchristos in->name = name;
1169573673dSchristos return in;
1179573673dSchristos }
1189573673dSchristos
1199573673dSchristos /* load buffer from file, return -1 on read error, 0 or 1 on success, with
1209573673dSchristos 1 indicating that end-of-file was reached */
bload(bin * in)1219573673dSchristos local int bload(bin *in)
1229573673dSchristos {
1239573673dSchristos long len;
1249573673dSchristos
1259573673dSchristos if (in == NULL)
1269573673dSchristos return -1;
1279573673dSchristos if (in->left != 0)
1289573673dSchristos return 0;
1299573673dSchristos in->next = in->buf;
1309573673dSchristos do {
1319573673dSchristos len = (long)read(in->fd, in->buf + in->left, CHUNK - in->left);
1329573673dSchristos if (len < 0)
1339573673dSchristos return -1;
1349573673dSchristos in->left += (unsigned)len;
1359573673dSchristos } while (len != 0 && in->left < CHUNK);
1369573673dSchristos return len == 0 ? 1 : 0;
1379573673dSchristos }
1389573673dSchristos
/* Get a byte from the file, bail if end of file.  Refills the buffer with
   bload() when it is empty; the value of the expression is the byte read
   (0..255).  The argument is evaluated more than once, so it must not have
   side effects. */
#define bget(in) ((in)->left ? 0 : bload(in), \
                  (in)->left ? ((in)->left--, *((in)->next)++) : \
                      bail("unexpected end of file on ", (in)->name))

1449573673dSchristos /* get a four-byte little-endian unsigned integer from file */
bget4(bin * in)1459573673dSchristos local unsigned long bget4(bin *in)
1469573673dSchristos {
1479573673dSchristos unsigned long val;
1489573673dSchristos
1499573673dSchristos val = bget(in);
1509573673dSchristos val += (unsigned long)(bget(in)) << 8;
1519573673dSchristos val += (unsigned long)(bget(in)) << 16;
1529573673dSchristos val += (unsigned long)(bget(in)) << 24;
1539573673dSchristos return val;
1549573673dSchristos }
1559573673dSchristos
1569573673dSchristos /* skip bytes in file */
bskip(bin * in,unsigned skip)1579573673dSchristos local void bskip(bin *in, unsigned skip)
1589573673dSchristos {
1599573673dSchristos /* check pointer */
1609573673dSchristos if (in == NULL)
1619573673dSchristos return;
1629573673dSchristos
1639573673dSchristos /* easy case -- skip bytes in buffer */
1649573673dSchristos if (skip <= in->left) {
1659573673dSchristos in->left -= skip;
1669573673dSchristos in->next += skip;
1679573673dSchristos return;
1689573673dSchristos }
1699573673dSchristos
1709573673dSchristos /* skip what's in buffer, discard buffer contents */
1719573673dSchristos skip -= in->left;
1729573673dSchristos in->left = 0;
1739573673dSchristos
1749573673dSchristos /* seek past multiples of CHUNK bytes */
1759573673dSchristos if (skip > CHUNK) {
1769573673dSchristos unsigned left;
1779573673dSchristos
1789573673dSchristos left = skip & (CHUNK - 1);
1799573673dSchristos if (left == 0) {
1809573673dSchristos /* exact number of chunks: seek all the way minus one byte to check
1819573673dSchristos for end-of-file with a read */
1829573673dSchristos lseek(in->fd, skip - 1, SEEK_CUR);
1839573673dSchristos if (read(in->fd, in->buf, 1) != 1)
1849573673dSchristos bail("unexpected end of file on ", in->name);
1859573673dSchristos return;
1869573673dSchristos }
1879573673dSchristos
1889573673dSchristos /* skip the integral chunks, update skip with remainder */
1899573673dSchristos lseek(in->fd, skip - left, SEEK_CUR);
1909573673dSchristos skip = left;
1919573673dSchristos }
1929573673dSchristos
1939573673dSchristos /* read more input and skip remainder */
1949573673dSchristos bload(in);
1959573673dSchristos if (skip > in->left)
1969573673dSchristos bail("unexpected end of file on ", in->name);
1979573673dSchristos in->left -= skip;
1989573673dSchristos in->next += skip;
1999573673dSchristos }
2009573673dSchristos
/* -- end of buffered input functions -- */

2039573673dSchristos /* skip the gzip header from file in */
gzhead(bin * in)2049573673dSchristos local void gzhead(bin *in)
2059573673dSchristos {
2069573673dSchristos int flags;
2079573673dSchristos
2089573673dSchristos /* verify gzip magic header and compression method */
2099573673dSchristos if (bget(in) != 0x1f || bget(in) != 0x8b || bget(in) != 8)
2109573673dSchristos bail(in->name, " is not a valid gzip file");
2119573673dSchristos
2129573673dSchristos /* get and verify flags */
2139573673dSchristos flags = bget(in);
2149573673dSchristos if ((flags & 0xe0) != 0)
2159573673dSchristos bail("unknown reserved bits set in ", in->name);
2169573673dSchristos
2179573673dSchristos /* skip modification time, extra flags, and os */
2189573673dSchristos bskip(in, 6);
2199573673dSchristos
2209573673dSchristos /* skip extra field if present */
2219573673dSchristos if (flags & 4) {
2229573673dSchristos unsigned len;
2239573673dSchristos
2249573673dSchristos len = bget(in);
2259573673dSchristos len += (unsigned)(bget(in)) << 8;
2269573673dSchristos bskip(in, len);
2279573673dSchristos }
2289573673dSchristos
2299573673dSchristos /* skip file name if present */
2309573673dSchristos if (flags & 8)
2319573673dSchristos while (bget(in) != 0)
2329573673dSchristos ;
2339573673dSchristos
2349573673dSchristos /* skip comment if present */
2359573673dSchristos if (flags & 16)
2369573673dSchristos while (bget(in) != 0)
2379573673dSchristos ;
2389573673dSchristos
2399573673dSchristos /* skip header crc if present */
2409573673dSchristos if (flags & 2)
2419573673dSchristos bskip(in, 2);
2429573673dSchristos }
2439573673dSchristos
/* Write a four-byte little-endian unsigned integer to out.  Only the low 32
   bits of val are written; any higher bits are discarded.  Write errors are
   not reported here -- they are left in the stream's error indicator. */
local void put4(unsigned long val, FILE *out)
{
    /* least significant byte first */
    putc(val & 0xff, out);
    putc((val >> 8) & 0xff, out);
    putc((val >> 16) & 0xff, out);
    putc((val >> 24) & 0xff, out);
}

2539573673dSchristos /* Load up zlib stream from buffered input, bail if end of file */
zpull(z_streamp strm,bin * in)2549573673dSchristos local void zpull(z_streamp strm, bin *in)
2559573673dSchristos {
2569573673dSchristos if (in->left == 0)
2579573673dSchristos bload(in);
2589573673dSchristos if (in->left == 0)
2599573673dSchristos bail("unexpected end of file on ", in->name);
2609573673dSchristos strm->avail_in = in->left;
2619573673dSchristos strm->next_in = in->next;
2629573673dSchristos }
2639573673dSchristos
2649573673dSchristos /* Write header for gzip file to out and initialize trailer. */
gzinit(unsigned long * crc,unsigned long * tot,FILE * out)2659573673dSchristos local void gzinit(unsigned long *crc, unsigned long *tot, FILE *out)
2669573673dSchristos {
2679573673dSchristos fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out);
2689573673dSchristos *crc = crc32(0L, Z_NULL, 0);
2699573673dSchristos *tot = 0;
2709573673dSchristos }
2719573673dSchristos
2729573673dSchristos /* Copy the compressed data from name, zeroing the last block bit of the last
2739573673dSchristos block if clr is true, and adding empty blocks as needed to get to a byte
2749573673dSchristos boundary. If clr is false, then the last block becomes the last block of
2759573673dSchristos the output, and the gzip trailer is written. crc and tot maintains the
2769573673dSchristos crc and length (modulo 2^32) of the output for the trailer. The resulting
2779573673dSchristos gzip file is written to out. gzinit() must be called before the first call
2789573673dSchristos of gzcopy() to write the gzip header and to initialize crc and tot. */
gzcopy(char * name,int clr,unsigned long * crc,unsigned long * tot,FILE * out)2799573673dSchristos local void gzcopy(char *name, int clr, unsigned long *crc, unsigned long *tot,
2809573673dSchristos FILE *out)
2819573673dSchristos {
2829573673dSchristos int ret; /* return value from zlib functions */
2839573673dSchristos int pos; /* where the "last block" bit is in byte */
2849573673dSchristos int last; /* true if processing the last block */
2859573673dSchristos bin *in; /* buffered input file */
2869573673dSchristos unsigned char *start; /* start of compressed data in buffer */
2879573673dSchristos unsigned char *junk; /* buffer for uncompressed data -- discarded */
2889573673dSchristos z_off_t len; /* length of uncompressed data (support > 4 GB) */
2899573673dSchristos z_stream strm; /* zlib inflate stream */
2909573673dSchristos
2919573673dSchristos /* open gzip file and skip header */
2929573673dSchristos in = bopen(name);
2939573673dSchristos if (in == NULL)
2949573673dSchristos bail("could not open ", name);
2959573673dSchristos gzhead(in);
2969573673dSchristos
2979573673dSchristos /* allocate buffer for uncompressed data and initialize raw inflate
2989573673dSchristos stream */
2999573673dSchristos junk = malloc(CHUNK);
3009573673dSchristos strm.zalloc = Z_NULL;
3019573673dSchristos strm.zfree = Z_NULL;
3029573673dSchristos strm.opaque = Z_NULL;
3039573673dSchristos strm.avail_in = 0;
3049573673dSchristos strm.next_in = Z_NULL;
3059573673dSchristos ret = inflateInit2(&strm, -15);
3069573673dSchristos if (junk == NULL || ret != Z_OK)
3079573673dSchristos bail("out of memory", "");
3089573673dSchristos
3099573673dSchristos /* inflate and copy compressed data, clear last-block bit if requested */
3109573673dSchristos len = 0;
3119573673dSchristos zpull(&strm, in);
312*8cbf5cb7Schristos start = in->next;
3139573673dSchristos last = start[0] & 1;
3149573673dSchristos if (last && clr)
3159573673dSchristos start[0] &= ~1;
3169573673dSchristos strm.avail_out = 0;
3179573673dSchristos for (;;) {
3189573673dSchristos /* if input used and output done, write used input and get more */
3199573673dSchristos if (strm.avail_in == 0 && strm.avail_out != 0) {
3209573673dSchristos fwrite(start, 1, strm.next_in - start, out);
3219573673dSchristos start = in->buf;
3229573673dSchristos in->left = 0;
3239573673dSchristos zpull(&strm, in);
3249573673dSchristos }
3259573673dSchristos
3269573673dSchristos /* decompress -- return early when end-of-block reached */
3279573673dSchristos strm.avail_out = CHUNK;
3289573673dSchristos strm.next_out = junk;
3299573673dSchristos ret = inflate(&strm, Z_BLOCK);
3309573673dSchristos switch (ret) {
3319573673dSchristos case Z_MEM_ERROR:
3329573673dSchristos bail("out of memory", "");
3339573673dSchristos case Z_DATA_ERROR:
3349573673dSchristos bail("invalid compressed data in ", in->name);
3359573673dSchristos }
3369573673dSchristos
3379573673dSchristos /* update length of uncompressed data */
3389573673dSchristos len += CHUNK - strm.avail_out;
3399573673dSchristos
3409573673dSchristos /* check for block boundary (only get this when block copied out) */
3419573673dSchristos if (strm.data_type & 128) {
3429573673dSchristos /* if that was the last block, then done */
3439573673dSchristos if (last)
3449573673dSchristos break;
3459573673dSchristos
3469573673dSchristos /* number of unused bits in last byte */
3479573673dSchristos pos = strm.data_type & 7;
3489573673dSchristos
3499573673dSchristos /* find the next last-block bit */
3509573673dSchristos if (pos != 0) {
3519573673dSchristos /* next last-block bit is in last used byte */
3529573673dSchristos pos = 0x100 >> pos;
3539573673dSchristos last = strm.next_in[-1] & pos;
3549573673dSchristos if (last && clr)
355*8cbf5cb7Schristos in->buf[strm.next_in - in->buf - 1] &= ~pos;
3569573673dSchristos }
3579573673dSchristos else {
3589573673dSchristos /* next last-block bit is in next unused byte */
3599573673dSchristos if (strm.avail_in == 0) {
3609573673dSchristos /* don't have that byte yet -- get it */
3619573673dSchristos fwrite(start, 1, strm.next_in - start, out);
3629573673dSchristos start = in->buf;
3639573673dSchristos in->left = 0;
3649573673dSchristos zpull(&strm, in);
3659573673dSchristos }
3669573673dSchristos last = strm.next_in[0] & 1;
3679573673dSchristos if (last && clr)
368*8cbf5cb7Schristos in->buf[strm.next_in - in->buf] &= ~1;
3699573673dSchristos }
3709573673dSchristos }
3719573673dSchristos }
3729573673dSchristos
3739573673dSchristos /* update buffer with unused input */
3749573673dSchristos in->left = strm.avail_in;
375*8cbf5cb7Schristos in->next = in->buf + (strm.next_in - in->buf);
3769573673dSchristos
3779573673dSchristos /* copy used input, write empty blocks to get to byte boundary */
3789573673dSchristos pos = strm.data_type & 7;
3799573673dSchristos fwrite(start, 1, in->next - start - 1, out);
3809573673dSchristos last = in->next[-1];
3819573673dSchristos if (pos == 0 || !clr)
3829573673dSchristos /* already at byte boundary, or last file: write last byte */
3839573673dSchristos putc(last, out);
3849573673dSchristos else {
3859573673dSchristos /* append empty blocks to last byte */
3869573673dSchristos last &= ((0x100 >> pos) - 1); /* assure unused bits are zero */
3879573673dSchristos if (pos & 1) {
3889573673dSchristos /* odd -- append an empty stored block */
3899573673dSchristos putc(last, out);
3909573673dSchristos if (pos == 1)
3919573673dSchristos putc(0, out); /* two more bits in block header */
3929573673dSchristos fwrite("\0\0\xff\xff", 1, 4, out);
3939573673dSchristos }
3949573673dSchristos else {
3959573673dSchristos /* even -- append 1, 2, or 3 empty fixed blocks */
3969573673dSchristos switch (pos) {
3979573673dSchristos case 6:
3989573673dSchristos putc(last | 8, out);
3999573673dSchristos last = 0;
4009573673dSchristos case 4:
4019573673dSchristos putc(last | 0x20, out);
4029573673dSchristos last = 0;
4039573673dSchristos case 2:
4049573673dSchristos putc(last | 0x80, out);
4059573673dSchristos putc(0, out);
4069573673dSchristos }
4079573673dSchristos }
4089573673dSchristos }
4099573673dSchristos
4109573673dSchristos /* update crc and tot */
4119573673dSchristos *crc = crc32_combine(*crc, bget4(in), len);
4129573673dSchristos *tot += (unsigned long)len;
4139573673dSchristos
4149573673dSchristos /* clean up */
4159573673dSchristos inflateEnd(&strm);
4169573673dSchristos free(junk);
4179573673dSchristos bclose(in);
4189573673dSchristos
4199573673dSchristos /* write trailer if this is the last gzip file */
4209573673dSchristos if (!clr) {
4219573673dSchristos put4(*crc, out);
4229573673dSchristos put4(*tot, out);
4239573673dSchristos }
4249573673dSchristos }
4259573673dSchristos
/* Join the gzip files on the command line, write result to stdout.

   With no file arguments a usage message is printed to stderr and 0 is
   returned.  Otherwise each argument is appended in order; every file but the
   last is copied with its last-block bit cleared, and the last one carries
   the combined trailer.  Returns 0 on success; any error terminates the
   program with status 1 through bail(). */
int main(int argc, char **argv)
{
    unsigned long crc, tot;     /* running crc and total uncompressed length */

    /* skip command name */
    argc--;
    argv++;

    /* show usage if no arguments (<= also covers an empty argv, where the
       decrement above leaves argc negative) */
    if (argc <= 0) {
        fputs("gzjoin usage: gzjoin f1.gz [f2.gz [f3.gz ...]] > fjoin.gz\n",
              stderr);
        return 0;
    }

    /* join gzip files on command line and write to stdout; argc after the
       decrement is the number of files still to come, so it is non-zero
       (clr true) for every file except the last */
    gzinit(&crc, &tot, stdout);
    while (argc--)
        gzcopy(*argv++, argc, &crc, &tot, stdout);

    /* none of the writes above are checked individually; catch any output
       failure (e.g. disk full, closed pipe) here instead of reporting
       success with a truncated result */
    if (fflush(stdout) != 0 || ferror(stdout))
        bail("write error on ", "stdout");

    /* done */
    return 0;
}