1/*	$NetBSD: gzjoin.c,v 1.1.1.1 2006/01/14 20:11:09 christos Exp $	*/
2
3/* gzjoin -- command to join gzip files into one gzip file
4
5  Copyright (C) 2004 Mark Adler, all rights reserved
6  version 1.0, 11 Dec 2004
7
8  This software is provided 'as-is', without any express or implied
9  warranty.  In no event will the author be held liable for any damages
10  arising from the use of this software.
11
12  Permission is granted to anyone to use this software for any purpose,
13  including commercial applications, and to alter it and redistribute it
14  freely, subject to the following restrictions:
15
16  1. The origin of this software must not be misrepresented; you must not
17     claim that you wrote the original software. If you use this software
18     in a product, an acknowledgment in the product documentation would be
19     appreciated but is not required.
20  2. Altered source versions must be plainly marked as such, and must not be
21     misrepresented as being the original software.
22  3. This notice may not be removed or altered from any source distribution.
23
24  Mark Adler    madler@alumni.caltech.edu
25 */
26
27/*
28 * Change history:
29 *
30 * 1.0  11 Dec 2004     - First version
31 * 1.1  12 Jun 2005     - Changed ssize_t to long for portability
32 */
33
34/*
35   gzjoin takes one or more gzip files on the command line and writes out a
36   single gzip file that will uncompress to the concatenation of the
37   uncompressed data from the individual gzip files.  gzjoin does this without
38   having to recompress any of the data and without having to calculate a new
39   crc32 for the concatenated uncompressed data.  gzjoin does however have to
40   decompress all of the input data in order to find the bits in the compressed
41   data that need to be modified to concatenate the streams.
42
43   gzjoin does not do an integrity check on the input gzip files other than
44   checking the gzip header and decompressing the compressed data.  They are
45   otherwise assumed to be complete and correct.
46
47   Each joint between gzip files removes at least 18 bytes of previous trailer
48   and subsequent header, and inserts an average of about three bytes to the
49   compressed data in order to connect the streams.  The output gzip file
50   has a minimal ten-byte gzip header with no file name or modification time.
51
52   This program was written to illustrate the use of the Z_BLOCK option of
53   inflate() and the crc32_combine() function.  gzjoin will not compile with
54   versions of zlib earlier than 1.2.3.
55 */
56
57#include <stdio.h>      /* fputs(), fprintf(), fwrite(), putc() */
58#include <stdlib.h>     /* exit(), malloc(), free() */
59#include <fcntl.h>      /* open() */
60#include <unistd.h>     /* close(), read(), lseek() */
61#include "zlib.h"
62    /* crc32(), crc32_combine(), inflateInit2(), inflate(), inflateEnd() */
63
#define local static

/* Report a fatal error on stderr and terminate the program with exit status
   1.  The message is "gzjoin error: " followed by why1, then why2, then
   ", output incomplete".  This function never returns to its caller; it is
   declared to return int (and ends with an unreachable return) only so that a
   call can appear as an operand in an expression, as the bget() macro does.
   The strings are only read, so they are const-qualified, which lets callers
   pass string literals without discarding a qualifier. */
local int bail(const char *why1, const char *why2)
{
    fprintf(stderr, "gzjoin error: %s%s, output incomplete\n", why1, why2);
    exit(1);
    return 0;               /* not reached */
}
73
74/* -- simple buffered file input with access to the buffer -- */
75
/* size in bytes of each input buffer and of the scratch inflate output
   buffer -- must be a power of two and must fit in an unsigned */
#define CHUNK 32768

/* bin -- state of one buffered input file */
typedef struct {
    char *name;             /* file name, used in error messages */
    int fd;                 /* descriptor of the file being read */
    unsigned left;          /* number of unread bytes available at next */
    unsigned char *next;    /* next unread byte (points into buf) */
    unsigned char *buf;     /* allocated buffer, CHUNK bytes long */
} bin;
86
87/* close a buffered file and free allocated memory */
88local void bclose(bin *in)
89{
90    if (in != NULL) {
91        if (in->fd != -1)
92            close(in->fd);
93        if (in->buf != NULL)
94            free(in->buf);
95        free(in);
96    }
97}
98
99/* open a buffered file for input, return a pointer to type bin, or NULL on
100   failure */
101local bin *bopen(char *name)
102{
103    bin *in;
104
105    in = malloc(sizeof(bin));
106    if (in == NULL)
107        return NULL;
108    in->buf = malloc(CHUNK);
109    in->fd = open(name, O_RDONLY, 0);
110    if (in->buf == NULL || in->fd == -1) {
111        bclose(in);
112        return NULL;
113    }
114    in->left = 0;
115    in->next = in->buf;
116    in->name = name;
117    return in;
118}
119
120/* load buffer from file, return -1 on read error, 0 or 1 on success, with
121   1 indicating that end-of-file was reached */
122local int bload(bin *in)
123{
124    long len;
125
126    if (in == NULL)
127        return -1;
128    if (in->left != 0)
129        return 0;
130    in->next = in->buf;
131    do {
132        len = (long)read(in->fd, in->buf + in->left, CHUNK - in->left);
133        if (len < 0)
134            return -1;
135        in->left += (unsigned)len;
136    } while (len != 0 && in->left < CHUNK);
137    return len == 0 ? 1 : 0;
138}
139
/* get the next byte from the file as a value in 0..255, loading the buffer
   first if it is empty; bail if the buffer is still empty after that.  The
   argument is parenthesized so any pointer expression may be passed, but it
   is evaluated several times and so must not have side effects. */
#define bget(in) ((in)->left ? 0 : bload(in), \
                  (in)->left ? ((in)->left--, *((in)->next)++) : \
                    bail("unexpected end of file on ", (in)->name))
144
145/* get a four-byte little-endian unsigned integer from file */
146local unsigned long bget4(bin *in)
147{
148    unsigned long val;
149
150    val = bget(in);
151    val += (unsigned long)(bget(in)) << 8;
152    val += (unsigned long)(bget(in)) << 16;
153    val += (unsigned long)(bget(in)) << 24;
154    return val;
155}
156
157/* skip bytes in file */
158local void bskip(bin *in, unsigned skip)
159{
160    /* check pointer */
161    if (in == NULL)
162        return;
163
164    /* easy case -- skip bytes in buffer */
165    if (skip <= in->left) {
166        in->left -= skip;
167        in->next += skip;
168        return;
169    }
170
171    /* skip what's in buffer, discard buffer contents */
172    skip -= in->left;
173    in->left = 0;
174
175    /* seek past multiples of CHUNK bytes */
176    if (skip > CHUNK) {
177        unsigned left;
178
179        left = skip & (CHUNK - 1);
180        if (left == 0) {
181            /* exact number of chunks: seek all the way minus one byte to check
182               for end-of-file with a read */
183            lseek(in->fd, skip - 1, SEEK_CUR);
184            if (read(in->fd, in->buf, 1) != 1)
185                bail("unexpected end of file on ", in->name);
186            return;
187        }
188
189        /* skip the integral chunks, update skip with remainder */
190        lseek(in->fd, skip - left, SEEK_CUR);
191        skip = left;
192    }
193
194    /* read more input and skip remainder */
195    bload(in);
196    if (skip > in->left)
197        bail("unexpected end of file on ", in->name);
198    in->left -= skip;
199    in->next += skip;
200}
201
202/* -- end of buffered input functions -- */
203
204/* skip the gzip header from file in */
205local void gzhead(bin *in)
206{
207    int flags;
208
209    /* verify gzip magic header and compression method */
210    if (bget(in) != 0x1f || bget(in) != 0x8b || bget(in) != 8)
211        bail(in->name, " is not a valid gzip file");
212
213    /* get and verify flags */
214    flags = bget(in);
215    if ((flags & 0xe0) != 0)
216        bail("unknown reserved bits set in ", in->name);
217
218    /* skip modification time, extra flags, and os */
219    bskip(in, 6);
220
221    /* skip extra field if present */
222    if (flags & 4) {
223        unsigned len;
224
225        len = bget(in);
226        len += (unsigned)(bget(in)) << 8;
227        bskip(in, len);
228    }
229
230    /* skip file name if present */
231    if (flags & 8)
232        while (bget(in) != 0)
233            ;
234
235    /* skip comment if present */
236    if (flags & 16)
237        while (bget(in) != 0)
238            ;
239
240    /* skip header crc if present */
241    if (flags & 2)
242        bskip(in, 2);
243}
244
/* write the low 32 bits of val to out as four bytes, least significant byte
   first */
local void put4(unsigned long val, FILE *out)
{
    int n;

    for (n = 0; n < 4; n++) {
        putc((int)(val & 0xff), out);
        val >>= 8;
    }
}
253
254/* Load up zlib stream from buffered input, bail if end of file */
255local void zpull(z_streamp strm, bin *in)
256{
257    if (in->left == 0)
258        bload(in);
259    if (in->left == 0)
260        bail("unexpected end of file on ", in->name);
261    strm->avail_in = in->left;
262    strm->next_in = in->next;
263}
264
265/* Write header for gzip file to out and initialize trailer. */
266local void gzinit(unsigned long *crc, unsigned long *tot, FILE *out)
267{
268    fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out);
269    *crc = crc32(0L, Z_NULL, 0);
270    *tot = 0;
271}
272
273/* Copy the compressed data from name, zeroing the last block bit of the last
274   block if clr is true, and adding empty blocks as needed to get to a byte
275   boundary.  If clr is false, then the last block becomes the last block of
276   the output, and the gzip trailer is written.  crc and tot maintains the
277   crc and length (modulo 2^32) of the output for the trailer.  The resulting
278   gzip file is written to out.  gzinit() must be called before the first call
279   of gzcopy() to write the gzip header and to initialize crc and tot. */
280local void gzcopy(char *name, int clr, unsigned long *crc, unsigned long *tot,
281                  FILE *out)
282{
283    int ret;                /* return value from zlib functions */
284    int pos;                /* where the "last block" bit is in byte */
285    int last;               /* true if processing the last block */
286    bin *in;                /* buffered input file */
287    unsigned char *start;   /* start of compressed data in buffer */
288    unsigned char *junk;    /* buffer for uncompressed data -- discarded */
289    z_off_t len;            /* length of uncompressed data (support > 4 GB) */
290    z_stream strm;          /* zlib inflate stream */
291
292    /* open gzip file and skip header */
293    in = bopen(name);
294    if (in == NULL)
295        bail("could not open ", name);
296    gzhead(in);
297
298    /* allocate buffer for uncompressed data and initialize raw inflate
299       stream */
300    junk = malloc(CHUNK);
301    strm.zalloc = Z_NULL;
302    strm.zfree = Z_NULL;
303    strm.opaque = Z_NULL;
304    strm.avail_in = 0;
305    strm.next_in = Z_NULL;
306    ret = inflateInit2(&strm, -15);
307    if (junk == NULL || ret != Z_OK)
308        bail("out of memory", "");
309
310    /* inflate and copy compressed data, clear last-block bit if requested */
311    len = 0;
312    zpull(&strm, in);
313    start = strm.next_in;
314    last = start[0] & 1;
315    if (last && clr)
316        start[0] &= ~1;
317    strm.avail_out = 0;
318    for (;;) {
319        /* if input used and output done, write used input and get more */
320        if (strm.avail_in == 0 && strm.avail_out != 0) {
321            fwrite(start, 1, strm.next_in - start, out);
322            start = in->buf;
323            in->left = 0;
324            zpull(&strm, in);
325        }
326
327        /* decompress -- return early when end-of-block reached */
328        strm.avail_out = CHUNK;
329        strm.next_out = junk;
330        ret = inflate(&strm, Z_BLOCK);
331        switch (ret) {
332        case Z_MEM_ERROR:
333            bail("out of memory", "");
334        case Z_DATA_ERROR:
335            bail("invalid compressed data in ", in->name);
336        }
337
338        /* update length of uncompressed data */
339        len += CHUNK - strm.avail_out;
340
341        /* check for block boundary (only get this when block copied out) */
342        if (strm.data_type & 128) {
343            /* if that was the last block, then done */
344            if (last)
345                break;
346
347            /* number of unused bits in last byte */
348            pos = strm.data_type & 7;
349
350            /* find the next last-block bit */
351            if (pos != 0) {
352                /* next last-block bit is in last used byte */
353                pos = 0x100 >> pos;
354                last = strm.next_in[-1] & pos;
355                if (last && clr)
356                    strm.next_in[-1] &= ~pos;
357            }
358            else {
359                /* next last-block bit is in next unused byte */
360                if (strm.avail_in == 0) {
361                    /* don't have that byte yet -- get it */
362                    fwrite(start, 1, strm.next_in - start, out);
363                    start = in->buf;
364                    in->left = 0;
365                    zpull(&strm, in);
366                }
367                last = strm.next_in[0] & 1;
368                if (last && clr)
369                    strm.next_in[0] &= ~1;
370            }
371        }
372    }
373
374    /* update buffer with unused input */
375    in->left = strm.avail_in;
376    in->next = strm.next_in;
377
378    /* copy used input, write empty blocks to get to byte boundary */
379    pos = strm.data_type & 7;
380    fwrite(start, 1, in->next - start - 1, out);
381    last = in->next[-1];
382    if (pos == 0 || !clr)
383        /* already at byte boundary, or last file: write last byte */
384        putc(last, out);
385    else {
386        /* append empty blocks to last byte */
387        last &= ((0x100 >> pos) - 1);       /* assure unused bits are zero */
388        if (pos & 1) {
389            /* odd -- append an empty stored block */
390            putc(last, out);
391            if (pos == 1)
392                putc(0, out);               /* two more bits in block header */
393            fwrite("\0\0\xff\xff", 1, 4, out);
394        }
395        else {
396            /* even -- append 1, 2, or 3 empty fixed blocks */
397            switch (pos) {
398            case 6:
399                putc(last | 8, out);
400                last = 0;
401            case 4:
402                putc(last | 0x20, out);
403                last = 0;
404            case 2:
405                putc(last | 0x80, out);
406                putc(0, out);
407            }
408        }
409    }
410
411    /* update crc and tot */
412    *crc = crc32_combine(*crc, bget4(in), len);
413    *tot += (unsigned long)len;
414
415    /* clean up */
416    inflateEnd(&strm);
417    free(junk);
418    bclose(in);
419
420    /* write trailer if this is the last gzip file */
421    if (!clr) {
422        put4(*crc, out);
423        put4(*tot, out);
424    }
425}
426
/* join the gzip files on the command line, write result to stdout -- with no
   arguments, a usage message is written to stderr and the exit status is 0;
   any error, including a failure to write the output, ends the program
   through bail() with exit status 1 */
int main(int argc, char **argv)
{
    unsigned long crc, tot;     /* running crc and total uncompressed length */

    /* skip command name */
    argc--;
    argv++;

    /* show usage if no arguments */
    if (argc == 0) {
        fputs("gzjoin usage: gzjoin f1.gz [f2.gz [f3.gz ...]] > fjoin.gz\n",
              stderr);
        return 0;
    }

    /* join gzip files on command line and write to stdout -- argc has already
       been decremented when it is passed as clr, so clr is zero only for the
       last file, which is the one that gets the trailer */
    gzinit(&crc, &tot, stdout);
    while (argc--)
        gzcopy(*argv++, argc, &crc, &tot, stdout);

    /* the individual writes are not checked, so verify here that everything
       reached stdout rather than reporting success for a truncated file */
    if (ferror(stdout) || fflush(stdout) != 0)
        bail("write error on ", "stdout");

    /* done */
    return 0;
}
451