/* gzjoin -- command to join gzip files into one gzip file

  Copyright (C) 2004, 2005, 2012 Mark Adler, all rights reserved
  version 1.2, 14 Aug 2012

  This software is provided 'as-is', without any express or implied
  warranty.  In no event will the author be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.

  Mark Adler    madler@alumni.caltech.edu
 */

/*
 * Change history:
 *
 * 1.0  11 Dec 2004     - First version
 * 1.1  12 Jun 2005     - Changed ssize_t to long for portability
 * 1.2  14 Aug 2012     - Clean up for z_const usage
 */

/*
   gzjoin takes one or more gzip files on the command line and writes out a
   single gzip file that will uncompress to the concatenation of the
   uncompressed data from the individual gzip files.  gzjoin does this without
   having to recompress any of the data and without having to calculate a new
   crc32 for the concatenated uncompressed data.  gzjoin does however have to
   decompress all of the input data in order to find the bits in the compressed
   data that need to be modified to concatenate the streams.

   gzjoin does not do an integrity check on the input gzip files other than
   checking the gzip header and decompressing the compressed data.  They are
   otherwise assumed to be complete and correct.

   Each joint between gzip files removes at least 18 bytes of previous trailer
   and subsequent header, and inserts an average of about three bytes to the
   compressed data in order to connect the streams.  The output gzip file
   has a minimal ten-byte gzip header with no file name or modification time.

   This program was written to illustrate the use of the Z_BLOCK option of
   inflate() and the crc32_combine() function.  gzjoin will not compile with
   versions of zlib earlier than 1.2.3.
 */

#include <stdio.h>      /* fputs(), fprintf(), fwrite(), putc() */
#include <stdlib.h>     /* exit(), malloc(), free() */
#include <fcntl.h>      /* open() */
#include <unistd.h>     /* close(), read(), lseek() */
#include "zlib.h"
    /* crc32(), crc32_combine(), inflateInit2(), inflate(), inflateEnd() */

/* every helper function in this file is declared "local", which expands to
   "static" and so gives it internal linkage */
#define local static

/* Report a fatal error on stderr and terminate the process with exit status 1.
   The message printed is "gzjoin error: " followed by why1 and why2
   concatenated, followed by ", output incomplete".  This function never
   returns to its caller; it is declared to return int (and ends with an
   unreachable return) only so that a call can appear as an operand in an
   expression, as the bget() macro requires. */
local int bail(const char *why1, const char *why2)
{
    fprintf(stderr, "gzjoin error: %s%s, output incomplete\n", why1, why2);
    exit(1);
    return 0;
}

/* -- simple buffered file input with access to the buffer -- */

/* Size in bytes of each input buffer and of the scratch buffer that receives
   the (discarded) uncompressed data.  bskip() masks with CHUNK - 1, so the
   value must be a power of two, and it must fit in an unsigned. */
#define CHUNK 32768

/* bin buffered input file type: a file descriptor plus a CHUNK-byte buffer
   whose unread portion is the "left" bytes starting at "next" */
typedef struct {
    char *name;             /* name of file for error messages (not owned) */
    int fd;                 /* file descriptor, or -1 if not open */
    unsigned left;          /* bytes remaining at next */
    unsigned char *next;    /* next byte to read */
    unsigned char *buf;     /* allocated buffer of length CHUNK */
} bin;

|  | /* close a buffered file and free allocated memory */ | ||
|  | local void bclose(bin *in) | ||
|  | { | ||
|  |     if (in != NULL) { | ||
|  |         if (in->fd != -1) | ||
|  |             close(in->fd); | ||
|  |         if (in->buf != NULL) | ||
|  |             free(in->buf); | ||
|  |         free(in); | ||
|  |     } | ||
|  | } | ||
|  | 
 | ||
|  | /* open a buffered file for input, return a pointer to type bin, or NULL on
 | ||
|  |    failure */ | ||
|  | local bin *bopen(char *name) | ||
|  | { | ||
|  |     bin *in; | ||
|  | 
 | ||
|  |     in = malloc(sizeof(bin)); | ||
|  |     if (in == NULL) | ||
|  |         return NULL; | ||
|  |     in->buf = malloc(CHUNK); | ||
|  |     in->fd = open(name, O_RDONLY, 0); | ||
|  |     if (in->buf == NULL || in->fd == -1) { | ||
|  |         bclose(in); | ||
|  |         return NULL; | ||
|  |     } | ||
|  |     in->left = 0; | ||
|  |     in->next = in->buf; | ||
|  |     in->name = name; | ||
|  |     return in; | ||
|  | } | ||
|  | 
 | ||
|  | /* load buffer from file, return -1 on read error, 0 or 1 on success, with
 | ||
|  |    1 indicating that end-of-file was reached */ | ||
|  | local int bload(bin *in) | ||
|  | { | ||
|  |     long len; | ||
|  | 
 | ||
|  |     if (in == NULL) | ||
|  |         return -1; | ||
|  |     if (in->left != 0) | ||
|  |         return 0; | ||
|  |     in->next = in->buf; | ||
|  |     do { | ||
|  |         len = (long)read(in->fd, in->buf + in->left, CHUNK - in->left); | ||
|  |         if (len < 0) | ||
|  |             return -1; | ||
|  |         in->left += (unsigned)len; | ||
|  |     } while (len != 0 && in->left < CHUNK); | ||
|  |     return len == 0 ? 1 : 0; | ||
|  | } | ||
|  | 
 | ||
/* Get a byte from the file, bail if end of file.  The buffer is refilled with
   bload() when empty.  The result has type int.  Note that the argument is
   evaluated several times, so it must be an expression without side
   effects. */
#define bget(in) ((in)->left ? 0 : bload(in), \
                  (in)->left ? ((in)->left--, *((in)->next)++) : \
                    bail("unexpected end of file on ", (in)->name))

|  | /* get a four-byte little-endian unsigned integer from file */ | ||
|  | local unsigned long bget4(bin *in) | ||
|  | { | ||
|  |     unsigned long val; | ||
|  | 
 | ||
|  |     val = bget(in); | ||
|  |     val += (unsigned long)(bget(in)) << 8; | ||
|  |     val += (unsigned long)(bget(in)) << 16; | ||
|  |     val += (unsigned long)(bget(in)) << 24; | ||
|  |     return val; | ||
|  | } | ||
|  | 
 | ||
|  | /* skip bytes in file */ | ||
|  | local void bskip(bin *in, unsigned skip) | ||
|  | { | ||
|  |     /* check pointer */ | ||
|  |     if (in == NULL) | ||
|  |         return; | ||
|  | 
 | ||
|  |     /* easy case -- skip bytes in buffer */ | ||
|  |     if (skip <= in->left) { | ||
|  |         in->left -= skip; | ||
|  |         in->next += skip; | ||
|  |         return; | ||
|  |     } | ||
|  | 
 | ||
|  |     /* skip what's in buffer, discard buffer contents */ | ||
|  |     skip -= in->left; | ||
|  |     in->left = 0; | ||
|  | 
 | ||
|  |     /* seek past multiples of CHUNK bytes */ | ||
|  |     if (skip > CHUNK) { | ||
|  |         unsigned left; | ||
|  | 
 | ||
|  |         left = skip & (CHUNK - 1); | ||
|  |         if (left == 0) { | ||
|  |             /* exact number of chunks: seek all the way minus one byte to check
 | ||
|  |                for end-of-file with a read */ | ||
|  |             lseek(in->fd, skip - 1, SEEK_CUR); | ||
|  |             if (read(in->fd, in->buf, 1) != 1) | ||
|  |                 bail("unexpected end of file on ", in->name); | ||
|  |             return; | ||
|  |         } | ||
|  | 
 | ||
|  |         /* skip the integral chunks, update skip with remainder */ | ||
|  |         lseek(in->fd, skip - left, SEEK_CUR); | ||
|  |         skip = left; | ||
|  |     } | ||
|  | 
 | ||
|  |     /* read more input and skip remainder */ | ||
|  |     bload(in); | ||
|  |     if (skip > in->left) | ||
|  |         bail("unexpected end of file on ", in->name); | ||
|  |     in->left -= skip; | ||
|  |     in->next += skip; | ||
|  | } | ||
|  | 
 | ||
/* -- end of buffered input functions -- */

|  | /* skip the gzip header from file in */ | ||
|  | local void gzhead(bin *in) | ||
|  | { | ||
|  |     int flags; | ||
|  | 
 | ||
|  |     /* verify gzip magic header and compression method */ | ||
|  |     if (bget(in) != 0x1f || bget(in) != 0x8b || bget(in) != 8) | ||
|  |         bail(in->name, " is not a valid gzip file"); | ||
|  | 
 | ||
|  |     /* get and verify flags */ | ||
|  |     flags = bget(in); | ||
|  |     if ((flags & 0xe0) != 0) | ||
|  |         bail("unknown reserved bits set in ", in->name); | ||
|  | 
 | ||
|  |     /* skip modification time, extra flags, and os */ | ||
|  |     bskip(in, 6); | ||
|  | 
 | ||
|  |     /* skip extra field if present */ | ||
|  |     if (flags & 4) { | ||
|  |         unsigned len; | ||
|  | 
 | ||
|  |         len = bget(in); | ||
|  |         len += (unsigned)(bget(in)) << 8; | ||
|  |         bskip(in, len); | ||
|  |     } | ||
|  | 
 | ||
|  |     /* skip file name if present */ | ||
|  |     if (flags & 8) | ||
|  |         while (bget(in) != 0) | ||
|  |             ; | ||
|  | 
 | ||
|  |     /* skip comment if present */ | ||
|  |     if (flags & 16) | ||
|  |         while (bget(in) != 0) | ||
|  |             ; | ||
|  | 
 | ||
|  |     /* skip header crc if present */ | ||
|  |     if (flags & 2) | ||
|  |         bskip(in, 2); | ||
|  | } | ||
|  | 
 | ||
/* Write a four-byte little-endian unsigned integer to out, least significant
   byte first.  Only the low 32 bits of val are written; any higher bits of a
   wider unsigned long are dropped. */
local void put4(unsigned long val, FILE *out)
{
    putc((int)(val & 0xff), out);
    putc((int)((val >> 8) & 0xff), out);
    putc((int)((val >> 16) & 0xff), out);
    putc((int)((val >> 24) & 0xff), out);
}

|  | /* Load up zlib stream from buffered input, bail if end of file */ | ||
|  | local void zpull(z_streamp strm, bin *in) | ||
|  | { | ||
|  |     if (in->left == 0) | ||
|  |         bload(in); | ||
|  |     if (in->left == 0) | ||
|  |         bail("unexpected end of file on ", in->name); | ||
|  |     strm->avail_in = in->left; | ||
|  |     strm->next_in = in->next; | ||
|  | } | ||
|  | 
 | ||
|  | /* Write header for gzip file to out and initialize trailer. */ | ||
|  | local void gzinit(unsigned long *crc, unsigned long *tot, FILE *out) | ||
|  | { | ||
|  |     fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out); | ||
|  |     *crc = crc32(0L, Z_NULL, 0); | ||
|  |     *tot = 0; | ||
|  | } | ||
|  | 
 | ||
|  | /* Copy the compressed data from name, zeroing the last block bit of the last
 | ||
|  |    block if clr is true, and adding empty blocks as needed to get to a byte | ||
|  |    boundary.  If clr is false, then the last block becomes the last block of | ||
|  |    the output, and the gzip trailer is written.  crc and tot maintains the | ||
|  |    crc and length (modulo 2^32) of the output for the trailer.  The resulting | ||
|  |    gzip file is written to out.  gzinit() must be called before the first call | ||
|  |    of gzcopy() to write the gzip header and to initialize crc and tot. */ | ||
|  | local void gzcopy(char *name, int clr, unsigned long *crc, unsigned long *tot, | ||
|  |                   FILE *out) | ||
|  | { | ||
|  |     int ret;                /* return value from zlib functions */ | ||
|  |     int pos;                /* where the "last block" bit is in byte */ | ||
|  |     int last;               /* true if processing the last block */ | ||
|  |     bin *in;                /* buffered input file */ | ||
|  |     unsigned char *start;   /* start of compressed data in buffer */ | ||
|  |     unsigned char *junk;    /* buffer for uncompressed data -- discarded */ | ||
|  |     z_off_t len;            /* length of uncompressed data (support > 4 GB) */ | ||
|  |     z_stream strm;          /* zlib inflate stream */ | ||
|  | 
 | ||
|  |     /* open gzip file and skip header */ | ||
|  |     in = bopen(name); | ||
|  |     if (in == NULL) | ||
|  |         bail("could not open ", name); | ||
|  |     gzhead(in); | ||
|  | 
 | ||
|  |     /* allocate buffer for uncompressed data and initialize raw inflate
 | ||
|  |        stream */ | ||
|  |     junk = malloc(CHUNK); | ||
|  |     strm.zalloc = Z_NULL; | ||
|  |     strm.zfree = Z_NULL; | ||
|  |     strm.opaque = Z_NULL; | ||
|  |     strm.avail_in = 0; | ||
|  |     strm.next_in = Z_NULL; | ||
|  |     ret = inflateInit2(&strm, -15); | ||
|  |     if (junk == NULL || ret != Z_OK) | ||
|  |         bail("out of memory", ""); | ||
|  | 
 | ||
|  |     /* inflate and copy compressed data, clear last-block bit if requested */ | ||
|  |     len = 0; | ||
|  |     zpull(&strm, in); | ||
|  |     start = in->next; | ||
|  |     last = start[0] & 1; | ||
|  |     if (last && clr) | ||
|  |         start[0] &= ~1; | ||
|  |     strm.avail_out = 0; | ||
|  |     for (;;) { | ||
|  |         /* if input used and output done, write used input and get more */ | ||
|  |         if (strm.avail_in == 0 && strm.avail_out != 0) { | ||
|  |             fwrite(start, 1, strm.next_in - start, out); | ||
|  |             start = in->buf; | ||
|  |             in->left = 0; | ||
|  |             zpull(&strm, in); | ||
|  |         } | ||
|  | 
 | ||
|  |         /* decompress -- return early when end-of-block reached */ | ||
|  |         strm.avail_out = CHUNK; | ||
|  |         strm.next_out = junk; | ||
|  |         ret = inflate(&strm, Z_BLOCK); | ||
|  |         switch (ret) { | ||
|  |         case Z_MEM_ERROR: | ||
|  |             bail("out of memory", ""); | ||
|  |         case Z_DATA_ERROR: | ||
|  |             bail("invalid compressed data in ", in->name); | ||
|  |         } | ||
|  | 
 | ||
|  |         /* update length of uncompressed data */ | ||
|  |         len += CHUNK - strm.avail_out; | ||
|  | 
 | ||
|  |         /* check for block boundary (only get this when block copied out) */ | ||
|  |         if (strm.data_type & 128) { | ||
|  |             /* if that was the last block, then done */ | ||
|  |             if (last) | ||
|  |                 break; | ||
|  | 
 | ||
|  |             /* number of unused bits in last byte */ | ||
|  |             pos = strm.data_type & 7; | ||
|  | 
 | ||
|  |             /* find the next last-block bit */ | ||
|  |             if (pos != 0) { | ||
|  |                 /* next last-block bit is in last used byte */ | ||
|  |                 pos = 0x100 >> pos; | ||
|  |                 last = strm.next_in[-1] & pos; | ||
|  |                 if (last && clr) | ||
|  |                     in->buf[strm.next_in - in->buf - 1] &= ~pos; | ||
|  |             } | ||
|  |             else { | ||
|  |                 /* next last-block bit is in next unused byte */ | ||
|  |                 if (strm.avail_in == 0) { | ||
|  |                     /* don't have that byte yet -- get it */ | ||
|  |                     fwrite(start, 1, strm.next_in - start, out); | ||
|  |                     start = in->buf; | ||
|  |                     in->left = 0; | ||
|  |                     zpull(&strm, in); | ||
|  |                 } | ||
|  |                 last = strm.next_in[0] & 1; | ||
|  |                 if (last && clr) | ||
|  |                     in->buf[strm.next_in - in->buf] &= ~1; | ||
|  |             } | ||
|  |         } | ||
|  |     } | ||
|  | 
 | ||
|  |     /* update buffer with unused input */ | ||
|  |     in->left = strm.avail_in; | ||
|  |     in->next = in->buf + (strm.next_in - in->buf); | ||
|  | 
 | ||
|  |     /* copy used input, write empty blocks to get to byte boundary */ | ||
|  |     pos = strm.data_type & 7; | ||
|  |     fwrite(start, 1, in->next - start - 1, out); | ||
|  |     last = in->next[-1]; | ||
|  |     if (pos == 0 || !clr) | ||
|  |         /* already at byte boundary, or last file: write last byte */ | ||
|  |         putc(last, out); | ||
|  |     else { | ||
|  |         /* append empty blocks to last byte */ | ||
|  |         last &= ((0x100 >> pos) - 1);       /* assure unused bits are zero */ | ||
|  |         if (pos & 1) { | ||
|  |             /* odd -- append an empty stored block */ | ||
|  |             putc(last, out); | ||
|  |             if (pos == 1) | ||
|  |                 putc(0, out);               /* two more bits in block header */ | ||
|  |             fwrite("\0\0\xff\xff", 1, 4, out); | ||
|  |         } | ||
|  |         else { | ||
|  |             /* even -- append 1, 2, or 3 empty fixed blocks */ | ||
|  |             switch (pos) { | ||
|  |             case 6: | ||
|  |                 putc(last | 8, out); | ||
|  |                 last = 0; | ||
|  |             case 4: | ||
|  |                 putc(last | 0x20, out); | ||
|  |                 last = 0; | ||
|  |             case 2: | ||
|  |                 putc(last | 0x80, out); | ||
|  |                 putc(0, out); | ||
|  |             } | ||
|  |         } | ||
|  |     } | ||
|  | 
 | ||
|  |     /* update crc and tot */ | ||
|  |     *crc = crc32_combine(*crc, bget4(in), len); | ||
|  |     *tot += (unsigned long)len; | ||
|  | 
 | ||
|  |     /* clean up */ | ||
|  |     inflateEnd(&strm); | ||
|  |     free(junk); | ||
|  |     bclose(in); | ||
|  | 
 | ||
|  |     /* write trailer if this is the last gzip file */ | ||
|  |     if (!clr) { | ||
|  |         put4(*crc, out); | ||
|  |         put4(*tot, out); | ||
|  |     } | ||
|  | } | ||
|  | 
 | ||
/* Join the gzip files on the command line, write result to stdout.  With no
   arguments a usage message is printed on stderr and 0 is returned.  Returns 0
   on success; every failure, including a failure to write the output, ends the
   program through bail() with exit status 1. */
int main(int argc, char **argv)
{
    unsigned long crc, tot;     /* running crc and total uncompressed length */

    /* skip command name */
    argc--;
    argv++;

    /* show usage if no arguments */
    if (argc == 0) {
        fputs("gzjoin usage: gzjoin f1.gz [f2.gz [f3.gz ...]] > fjoin.gz\n",
              stderr);
        return 0;
    }

    /* join gzip files on command line and write to stdout -- after the
       decrement argc is the number of files still to come, so it is nonzero
       (clear the last-block bit) for every file but the last */
    gzinit(&crc, &tot, stdout);
    while (argc--)
        gzcopy(*argv++, argc, &crc, &tot, stdout);

    /* none of the individual writes are checked, so verify here that all of
       the output actually made it out (e.g. the disk did not fill up) */
    if (fflush(stdout) != 0 || ferror(stdout))
        bail("write error on ", "stdout");

    /* done */
    return 0;
}