14e00368fSchristos /* gzjoin -- command to join gzip files into one gzip file
24e00368fSchristos
3*ed8eb4c2Schristos Copyright (C) 2004, 2005, 2012 Mark Adler, all rights reserved
4*ed8eb4c2Schristos version 1.2, 14 Aug 2012
54e00368fSchristos
64e00368fSchristos This software is provided 'as-is', without any express or implied
74e00368fSchristos warranty. In no event will the author be held liable for any damages
84e00368fSchristos arising from the use of this software.
94e00368fSchristos
104e00368fSchristos Permission is granted to anyone to use this software for any purpose,
114e00368fSchristos including commercial applications, and to alter it and redistribute it
124e00368fSchristos freely, subject to the following restrictions:
134e00368fSchristos
144e00368fSchristos 1. The origin of this software must not be misrepresented; you must not
154e00368fSchristos claim that you wrote the original software. If you use this software
164e00368fSchristos in a product, an acknowledgment in the product documentation would be
174e00368fSchristos appreciated but is not required.
184e00368fSchristos 2. Altered source versions must be plainly marked as such, and must not be
194e00368fSchristos misrepresented as being the original software.
204e00368fSchristos 3. This notice may not be removed or altered from any source distribution.
214e00368fSchristos
224e00368fSchristos Mark Adler madler@alumni.caltech.edu
234e00368fSchristos */
244e00368fSchristos
/*
 * Change history:
 *
 * 1.0  11 Dec 2004     - First version
 * 1.1  12 Jun 2005     - Changed ssize_t to long for portability
 * 1.2  14 Aug 2012     - Clean up for z_const usage
 */
324e00368fSchristos
/*
   gzjoin takes one or more gzip files on the command line and writes out a
   single gzip file that will uncompress to the concatenation of the
   uncompressed data from the individual gzip files.  gzjoin does this without
   having to recompress any of the data and without having to calculate a new
   crc32 for the concatenated uncompressed data.  gzjoin does however have to
   decompress all of the input data in order to find the bits in the compressed
   data that need to be modified to concatenate the streams.

   gzjoin does not do an integrity check on the input gzip files other than
   checking the gzip header and decompressing the compressed data.  They are
   otherwise assumed to be complete and correct.

   Each joint between gzip files removes at least 18 bytes of previous trailer
   and subsequent header, and inserts an average of about three bytes to the
   compressed data in order to connect the streams.  The output gzip file
   has a minimal ten-byte gzip header with no file name or modification time.

   This program was written to illustrate the use of the Z_BLOCK option of
   inflate() and the crc32_combine() function.  gzjoin will not compile with
   versions of zlib earlier than 1.2.3.
 */
554e00368fSchristos
564e00368fSchristos #include <stdio.h> /* fputs(), fprintf(), fwrite(), putc() */
574e00368fSchristos #include <stdlib.h> /* exit(), malloc(), free() */
584e00368fSchristos #include <fcntl.h> /* open() */
594e00368fSchristos #include <unistd.h> /* close(), read(), lseek() */
604e00368fSchristos #include "zlib.h"
614e00368fSchristos /* crc32(), crc32_combine(), inflateInit2(), inflate(), inflateEnd() */
624e00368fSchristos
/* "local" expands to static: every function in this file other than main()
   is declared local and therefore has internal linkage */
#define local static
644e00368fSchristos
/* Report a fatal error and terminate the program.

   The message written to stderr is built from why1 immediately followed by
   why2.  The process then exits with status 1, so this function never
   actually returns to its caller.  It is nevertheless declared to return int
   so that a call can be used as an operand in an expression, which is how the
   bget() macro uses it. */
local int bail(char *why1, char *why2)
{
    (void)fprintf(stderr, "gzjoin error: %s%s, output incomplete\n",
                  why1, why2);
    exit(1);

    /* not reached -- present only to satisfy the int return type */
    return 0;
}
724e00368fSchristos
734e00368fSchristos /* -- simple buffered file input with access to the buffer -- */
744e00368fSchristos
/* Size in bytes of the input buffer allocated by bopen() and of the discarded
   output buffer allocated by gzcopy().  It must be a power of two because
   bskip() uses CHUNK - 1 as a bit mask to take a length modulo CHUNK, and it
   must fit in an unsigned because the buffer byte counts have that type. */
#define CHUNK 32768
764e00368fSchristos
/* bin buffered input file type: an open file descriptor together with a
   CHUNK-byte buffer.  The unread part of the buffer is the left bytes that
   start at next.  Instances are created by bopen() and released by
   bclose(). */
typedef struct {
    char *name;             /* name of file for error messages (the pointer
                               passed to bopen() is stored, not a copy) */
    int fd;                 /* file descriptor, -1 if the open failed */
    unsigned left;          /* bytes remaining at next */
    unsigned char *next;    /* next byte to read */
    unsigned char *buf;     /* allocated buffer of length CHUNK */
} bin;
854e00368fSchristos
864e00368fSchristos /* close a buffered file and free allocated memory */
bclose(bin * in)874e00368fSchristos local void bclose(bin *in)
884e00368fSchristos {
894e00368fSchristos if (in != NULL) {
904e00368fSchristos if (in->fd != -1)
914e00368fSchristos close(in->fd);
924e00368fSchristos if (in->buf != NULL)
934e00368fSchristos free(in->buf);
944e00368fSchristos free(in);
954e00368fSchristos }
964e00368fSchristos }
974e00368fSchristos
984e00368fSchristos /* open a buffered file for input, return a pointer to type bin, or NULL on
994e00368fSchristos failure */
bopen(char * name)1004e00368fSchristos local bin *bopen(char *name)
1014e00368fSchristos {
1024e00368fSchristos bin *in;
1034e00368fSchristos
1044e00368fSchristos in = malloc(sizeof(bin));
1054e00368fSchristos if (in == NULL)
1064e00368fSchristos return NULL;
1074e00368fSchristos in->buf = malloc(CHUNK);
1084e00368fSchristos in->fd = open(name, O_RDONLY, 0);
1094e00368fSchristos if (in->buf == NULL || in->fd == -1) {
1104e00368fSchristos bclose(in);
1114e00368fSchristos return NULL;
1124e00368fSchristos }
1134e00368fSchristos in->left = 0;
1144e00368fSchristos in->next = in->buf;
1154e00368fSchristos in->name = name;
1164e00368fSchristos return in;
1174e00368fSchristos }
1184e00368fSchristos
1194e00368fSchristos /* load buffer from file, return -1 on read error, 0 or 1 on success, with
1204e00368fSchristos 1 indicating that end-of-file was reached */
bload(bin * in)1214e00368fSchristos local int bload(bin *in)
1224e00368fSchristos {
1234e00368fSchristos long len;
1244e00368fSchristos
1254e00368fSchristos if (in == NULL)
1264e00368fSchristos return -1;
1274e00368fSchristos if (in->left != 0)
1284e00368fSchristos return 0;
1294e00368fSchristos in->next = in->buf;
1304e00368fSchristos do {
1314e00368fSchristos len = (long)read(in->fd, in->buf + in->left, CHUNK - in->left);
1324e00368fSchristos if (len < 0)
1334e00368fSchristos return -1;
1344e00368fSchristos in->left += (unsigned)len;
1354e00368fSchristos } while (len != 0 && in->left < CHUNK);
1364e00368fSchristos return len == 0 ? 1 : 0;
1374e00368fSchristos }
1384e00368fSchristos
/* Get a byte from the file, bail if end of file.

   If the buffer of in is empty it is first reloaded with bload().  When a
   byte is then available it is consumed and its value (0..255) is the value
   of the expression; otherwise bail() is called, which does not return.  A
   read error reported by bload() leaves the buffer empty and is therefore
   also reported as an unexpected end of file.

   The argument is evaluated several times, so it must have no side effects.
   It is parenthesized everywhere so that any pointer expression can be
   passed, not just a plain identifier. */
#define bget(in) ((in)->left ? 0 : bload(in), \
                  (in)->left ? ((in)->left--, *((in)->next)++) : \
                      bail("unexpected end of file on ", (in)->name))
1434e00368fSchristos
1444e00368fSchristos /* get a four-byte little-endian unsigned integer from file */
bget4(bin * in)1454e00368fSchristos local unsigned long bget4(bin *in)
1464e00368fSchristos {
1474e00368fSchristos unsigned long val;
1484e00368fSchristos
1494e00368fSchristos val = bget(in);
1504e00368fSchristos val += (unsigned long)(bget(in)) << 8;
1514e00368fSchristos val += (unsigned long)(bget(in)) << 16;
1524e00368fSchristos val += (unsigned long)(bget(in)) << 24;
1534e00368fSchristos return val;
1544e00368fSchristos }
1554e00368fSchristos
1564e00368fSchristos /* skip bytes in file */
bskip(bin * in,unsigned skip)1574e00368fSchristos local void bskip(bin *in, unsigned skip)
1584e00368fSchristos {
1594e00368fSchristos /* check pointer */
1604e00368fSchristos if (in == NULL)
1614e00368fSchristos return;
1624e00368fSchristos
1634e00368fSchristos /* easy case -- skip bytes in buffer */
1644e00368fSchristos if (skip <= in->left) {
1654e00368fSchristos in->left -= skip;
1664e00368fSchristos in->next += skip;
1674e00368fSchristos return;
1684e00368fSchristos }
1694e00368fSchristos
1704e00368fSchristos /* skip what's in buffer, discard buffer contents */
1714e00368fSchristos skip -= in->left;
1724e00368fSchristos in->left = 0;
1734e00368fSchristos
1744e00368fSchristos /* seek past multiples of CHUNK bytes */
1754e00368fSchristos if (skip > CHUNK) {
1764e00368fSchristos unsigned left;
1774e00368fSchristos
1784e00368fSchristos left = skip & (CHUNK - 1);
1794e00368fSchristos if (left == 0) {
1804e00368fSchristos /* exact number of chunks: seek all the way minus one byte to check
1814e00368fSchristos for end-of-file with a read */
1824e00368fSchristos lseek(in->fd, skip - 1, SEEK_CUR);
1834e00368fSchristos if (read(in->fd, in->buf, 1) != 1)
1844e00368fSchristos bail("unexpected end of file on ", in->name);
1854e00368fSchristos return;
1864e00368fSchristos }
1874e00368fSchristos
1884e00368fSchristos /* skip the integral chunks, update skip with remainder */
1894e00368fSchristos lseek(in->fd, skip - left, SEEK_CUR);
1904e00368fSchristos skip = left;
1914e00368fSchristos }
1924e00368fSchristos
1934e00368fSchristos /* read more input and skip remainder */
1944e00368fSchristos bload(in);
1954e00368fSchristos if (skip > in->left)
1964e00368fSchristos bail("unexpected end of file on ", in->name);
1974e00368fSchristos in->left -= skip;
1984e00368fSchristos in->next += skip;
1994e00368fSchristos }
2004e00368fSchristos
2014e00368fSchristos /* -- end of buffered input functions -- */
2024e00368fSchristos
2034e00368fSchristos /* skip the gzip header from file in */
gzhead(bin * in)2044e00368fSchristos local void gzhead(bin *in)
2054e00368fSchristos {
2064e00368fSchristos int flags;
2074e00368fSchristos
2084e00368fSchristos /* verify gzip magic header and compression method */
2094e00368fSchristos if (bget(in) != 0x1f || bget(in) != 0x8b || bget(in) != 8)
2104e00368fSchristos bail(in->name, " is not a valid gzip file");
2114e00368fSchristos
2124e00368fSchristos /* get and verify flags */
2134e00368fSchristos flags = bget(in);
2144e00368fSchristos if ((flags & 0xe0) != 0)
2154e00368fSchristos bail("unknown reserved bits set in ", in->name);
2164e00368fSchristos
2174e00368fSchristos /* skip modification time, extra flags, and os */
2184e00368fSchristos bskip(in, 6);
2194e00368fSchristos
2204e00368fSchristos /* skip extra field if present */
2214e00368fSchristos if (flags & 4) {
2224e00368fSchristos unsigned len;
2234e00368fSchristos
2244e00368fSchristos len = bget(in);
2254e00368fSchristos len += (unsigned)(bget(in)) << 8;
2264e00368fSchristos bskip(in, len);
2274e00368fSchristos }
2284e00368fSchristos
2294e00368fSchristos /* skip file name if present */
2304e00368fSchristos if (flags & 8)
2314e00368fSchristos while (bget(in) != 0)
2324e00368fSchristos ;
2334e00368fSchristos
2344e00368fSchristos /* skip comment if present */
2354e00368fSchristos if (flags & 16)
2364e00368fSchristos while (bget(in) != 0)
2374e00368fSchristos ;
2384e00368fSchristos
2394e00368fSchristos /* skip header crc if present */
2404e00368fSchristos if (flags & 2)
2414e00368fSchristos bskip(in, 2);
2424e00368fSchristos }
2434e00368fSchristos
/* Write the low 32 bits of val to out as four bytes in little-endian order
   (least significant byte first).  Any bits of val above the low 32 are
   ignored.  The results of the individual putc() calls are not checked. */
local void put4(unsigned long val, FILE *out)
{
    int shift;

    for (shift = 0; shift <= 24; shift += 8)
        putc((int)((val >> shift) & 0xff), out);
}
2524e00368fSchristos
2534e00368fSchristos /* Load up zlib stream from buffered input, bail if end of file */
zpull(z_streamp strm,bin * in)2544e00368fSchristos local void zpull(z_streamp strm, bin *in)
2554e00368fSchristos {
2564e00368fSchristos if (in->left == 0)
2574e00368fSchristos bload(in);
2584e00368fSchristos if (in->left == 0)
2594e00368fSchristos bail("unexpected end of file on ", in->name);
2604e00368fSchristos strm->avail_in = in->left;
2614e00368fSchristos strm->next_in = in->next;
2624e00368fSchristos }
2634e00368fSchristos
2644e00368fSchristos /* Write header for gzip file to out and initialize trailer. */
gzinit(unsigned long * crc,unsigned long * tot,FILE * out)2654e00368fSchristos local void gzinit(unsigned long *crc, unsigned long *tot, FILE *out)
2664e00368fSchristos {
2674e00368fSchristos fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out);
2684e00368fSchristos *crc = crc32(0L, Z_NULL, 0);
2694e00368fSchristos *tot = 0;
2704e00368fSchristos }
2714e00368fSchristos
/* Copy the compressed data from name, zeroing the last block bit of the last
   block if clr is true, and adding empty blocks as needed to get to a byte
   boundary.  If clr is false, then the last block becomes the last block of
   the output, and the gzip trailer is written.  crc and tot maintain the
   crc and length (modulo 2^32) of the output for the trailer.  The resulting
   gzip file is written to out.  gzinit() must be called before the first call
   of gzcopy() to write the gzip header and to initialize crc and tot.

   The input is decompressed only to locate the deflate block boundaries; the
   uncompressed bytes are counted and then discarded.  The crc of the input's
   uncompressed data is not recomputed: it is taken from the four bytes that
   follow the compressed data in the input file.  Every failure (file cannot
   be opened, bad header, out of memory, invalid compressed data, premature
   end of file) is reported through bail(), which exits the program.  The
   results of fwrite() and putc() are not checked here. */
local void gzcopy(char *name, int clr, unsigned long *crc, unsigned long *tot,
                  FILE *out)
{
    int ret;                /* return value from zlib functions */
    int pos;                /* where the "last block" bit is in byte */
    int last;               /* true if processing the last block */
    bin *in;                /* buffered input file */
    unsigned char *start;   /* start of compressed data in buffer */
    unsigned char *junk;    /* buffer for uncompressed data -- discarded */
    z_off_t len;            /* length of uncompressed data (support > 4 GB) */
    z_stream strm;          /* zlib inflate stream */

    /* open gzip file and skip header */
    in = bopen(name);
    if (in == NULL)
        bail("could not open ", name);
    gzhead(in);

    /* allocate buffer for uncompressed data and initialize raw inflate
       stream (both results are tested together below; any failure of
       inflateInit2() is reported as out of memory) */
    junk = malloc(CHUNK);
    strm.zalloc = Z_NULL;
    strm.zfree = Z_NULL;
    strm.opaque = Z_NULL;
    strm.avail_in = 0;
    strm.next_in = Z_NULL;
    ret = inflateInit2(&strm, -15);
    if (junk == NULL || ret != Z_OK)
        bail("out of memory", "");

    /* inflate and copy compressed data, clear last-block bit if requested --
       the last-block bit of the first block is the low bit of the first
       byte of compressed data */
    len = 0;
    zpull(&strm, in);
    start = in->next;
    last = start[0] & 1;
    if (last && clr)
        start[0] &= ~1;
    /* no output space offered yet; it is set to CHUNK before each inflate() */
    strm.avail_out = 0;
    for (;;) {
        /* if input used and output done, write used input and get more --
           the input buffer is about to be overwritten, so everything
           consumed from it so far has to go to out first */
        if (strm.avail_in == 0 && strm.avail_out != 0) {
            fwrite(start, 1, strm.next_in - start, out);
            start = in->buf;
            in->left = 0;
            zpull(&strm, in);
        }

        /* decompress -- return early when end-of-block reached */
        strm.avail_out = CHUNK;
        strm.next_out = junk;
        ret = inflate(&strm, Z_BLOCK);
        switch (ret) {
        /* bail() does not return, so these cases need no break */
        case Z_MEM_ERROR:
            bail("out of memory", "");
        case Z_DATA_ERROR:
            bail("invalid compressed data in ", in->name);
        }

        /* update length of uncompressed data */
        len += CHUNK - strm.avail_out;

        /* check for block boundary (only get this when block copied out) */
        if (strm.data_type & 128) {
            /* if that was the last block, then done */
            if (last)
                break;

            /* number of unused bits in last byte */
            pos = strm.data_type & 7;

            /* find the next last-block bit */
            if (pos != 0) {
                /* next last-block bit is in last used byte -- turn the count
                   of unused bits into a mask for the lowest unused bit */
                pos = 0x100 >> pos;
                last = strm.next_in[-1] & pos;
                /* the byte is modified through in->buf rather than through
                   strm.next_in -- presumably because next_in may be
                   const-qualified (see the z_const entry in the change
                   history); both refer to the same buffer */
                if (last && clr)
                    in->buf[strm.next_in - in->buf - 1] &= ~pos;
            }
            else {
                /* next last-block bit is in next unused byte */
                if (strm.avail_in == 0) {
                    /* don't have that byte yet -- get it */
                    fwrite(start, 1, strm.next_in - start, out);
                    start = in->buf;
                    in->left = 0;
                    zpull(&strm, in);
                }
                last = strm.next_in[0] & 1;
                if (last && clr)
                    in->buf[strm.next_in - in->buf] &= ~1;
            }
        }
    }

    /* update buffer with unused input */
    in->left = strm.avail_in;
    in->next = in->buf + (strm.next_in - in->buf);

    /* copy used input, write empty blocks to get to byte boundary -- all but
       the last used byte is written as is; from here on last holds the value
       of that final byte rather than a flag */
    pos = strm.data_type & 7;
    fwrite(start, 1, in->next - start - 1, out);
    last = in->next[-1];
    if (pos == 0 || !clr)
        /* already at byte boundary, or last file: write last byte */
        putc(last, out);
    else {
        /* append empty blocks to last byte */
        last &= ((0x100 >> pos) - 1);       /* assure unused bits are zero */
        if (pos & 1) {
            /* odd -- append an empty stored block */
            putc(last, out);
            if (pos == 1)
                putc(0, out);               /* two more bits in block header */
            /* stored length of zero followed by its one's complement */
            fwrite("\0\0\xff\xff", 1, 4, out);
        }
        else {
            /* even -- append 1, 2, or 3 empty fixed blocks; each case adds
               one block and deliberately falls through to the next */
            switch (pos) {
            case 6:
                putc(last | 8, out);
                last = 0;
                /* fall through */
            case 4:
                putc(last | 0x20, out);
                last = 0;
                /* fall through */
            case 2:
                putc(last | 0x80, out);
                putc(0, out);
            }
        }
    }

    /* update crc and tot -- the four bytes read here are the ones that
       follow the compressed data in the input; the bytes after them are
       never read */
    *crc = crc32_combine(*crc, bget4(in), len);
    *tot += (unsigned long)len;

    /* clean up */
    inflateEnd(&strm);
    free(junk);
    bclose(in);

    /* write trailer if this is the last gzip file */
    if (!clr) {
        put4(*crc, out);
        put4(*tot, out);
    }
}
4254e00368fSchristos
/* Join the gzip files named on the command line and write the result to
   stdout.

   With no file arguments a usage message is written to stderr and the exit
   status is 0.  Otherwise each file is appended in order; the last one is
   passed to gzcopy() with clr equal to zero so that it terminates the deflate
   stream and the trailer is written.  Any failure, including a failure to
   write the output, is reported through bail(), which exits with status 1.
   Returns 0 on success. */
int main(int argc, char **argv)
{
    unsigned long crc, tot;     /* running crc and total uncompressed length */

    /* skip command name */
    argc--;
    argv++;

    /* show usage if no arguments (argc is negative here if the program was
       started with an empty argument vector) */
    if (argc <= 0) {
        fputs("gzjoin usage: gzjoin f1.gz [f2.gz [f3.gz ...]] > fjoin.gz\n",
              stderr);
        return 0;
    }

    /* join gzip files on command line and write to stdout -- argc has
       already been decremented when gzcopy() is called, so it is the number
       of files still to come and is zero only for the last file */
    gzinit(&crc, &tot, stdout);
    while (argc--)
        gzcopy(*argv++, argc, &crc, &tot, stdout);

    /* none of the individual writes are checked, so flush and look at the
       stream's error indicator to catch a failed write (e.g. disk full or
       closed pipe) instead of reporting success for a truncated output */
    if (fflush(stdout) == EOF || ferror(stdout))
        bail("write error on ", "stdout");

    /* done */
    return 0;
}
450