1/* $NetBSD: gzjoin.c,v 1.1.1.1 2006/01/14 20:11:09 christos Exp $ */ 2 3/* gzjoin -- command to join gzip files into one gzip file 4 5 Copyright (C) 2004 Mark Adler, all rights reserved 6 version 1.0, 11 Dec 2004 7 8 This software is provided 'as-is', without any express or implied 9 warranty. In no event will the author be held liable for any damages 10 arising from the use of this software. 11 12 Permission is granted to anyone to use this software for any purpose, 13 including commercial applications, and to alter it and redistribute it 14 freely, subject to the following restrictions: 15 16 1. The origin of this software must not be misrepresented; you must not 17 claim that you wrote the original software. If you use this software 18 in a product, an acknowledgment in the product documentation would be 19 appreciated but is not required. 20 2. Altered source versions must be plainly marked as such, and must not be 21 misrepresented as being the original software. 22 3. This notice may not be removed or altered from any source distribution. 23 24 Mark Adler madler@alumni.caltech.edu 25 */ 26 27/* 28 * Change history: 29 * 30 * 1.0 11 Dec 2004 - First version 31 * 1.1 12 Jun 2005 - Changed ssize_t to long for portability 32 */ 33 34/* 35 gzjoin takes one or more gzip files on the command line and writes out a 36 single gzip file that will uncompress to the concatenation of the 37 uncompressed data from the individual gzip files. gzjoin does this without 38 having to recompress any of the data and without having to calculate a new 39 crc32 for the concatenated uncompressed data. gzjoin does however have to 40 decompress all of the input data in order to find the bits in the compressed 41 data that need to be modified to concatenate the streams. 42 43 gzjoin does not do an integrity check on the input gzip files other than 44 checking the gzip header and decompressing the compressed data. They are 45 otherwise assumed to be complete and correct. 46 47 Each joint between gzip files removes at least 18 bytes of previous trailer 48 and subsequent header, and inserts an average of about three bytes to the 49 compressed data in order to connect the streams. The output gzip file 50 has a minimal ten-byte gzip header with no file name or modification time. 51 52 This program was written to illustrate the use of the Z_BLOCK option of 53 inflate() and the crc32_combine() function. gzjoin will not compile with 54 versions of zlib earlier than 1.2.3. 55 */ 56 57#include <stdio.h> /* fputs(), fprintf(), fwrite(), putc() */ 58#include <stdlib.h> /* exit(), malloc(), free() */ 59#include <fcntl.h> /* open() */ 60#include <unistd.h> /* close(), read(), lseek() */ 61#include "zlib.h" 62 /* crc32(), crc32_combine(), inflateInit2(), inflate(), inflateEnd() */ 63 64#define local static 65 66/* exit with an error (return a value to allow use in an expression) */ 67local int bail(char *why1, char *why2) 68{ 69 fprintf(stderr, "gzjoin error: %s%s, output incomplete\n", why1, why2); 70 exit(1); 71 return 0; 72} 73 74/* -- simple buffered file input with access to the buffer -- */ 75 76#define CHUNK 32768 /* must be a power of two and fit in unsigned */ 77 78/* bin buffered input file type */ 79typedef struct { 80 char *name; /* name of file for error messages */ 81 int fd; /* file descriptor */ 82 unsigned left; /* bytes remaining at next */ 83 unsigned char *next; /* next byte to read */ 84 unsigned char *buf; /* allocated buffer of length CHUNK */ 85} bin; 86 87/* close a buffered file and free allocated memory */ 88local void bclose(bin *in) 89{ 90 if (in != NULL) { 91 if (in->fd != -1) 92 close(in->fd); 93 if (in->buf != NULL) 94 free(in->buf); 95 free(in); 96 } 97} 98 99/* open a buffered file for input, return a pointer to type bin, or NULL on 100 failure */ 101local bin *bopen(char *name) 102{ 103 bin *in; 104 105 in = malloc(sizeof(bin)); 106 if (in == NULL) 107 return NULL; 108 in->buf = malloc(CHUNK); 109 in->fd = open(name, O_RDONLY, 0); 110 if (in->buf == NULL || in->fd == -1) { 111 bclose(in); 112 return NULL; 113 } 114 in->left = 0; 115 in->next = in->buf; 116 in->name = name; 117 return in; 118} 119 120/* load buffer from file, return -1 on read error, 0 or 1 on success, with 121 1 indicating that end-of-file was reached */ 122local int bload(bin *in) 123{ 124 long len; 125 126 if (in == NULL) 127 return -1; 128 if (in->left != 0) 129 return 0; 130 in->next = in->buf; 131 do { 132 len = (long)read(in->fd, in->buf + in->left, CHUNK - in->left); 133 if (len < 0) 134 return -1; 135 in->left += (unsigned)len; 136 } while (len != 0 && in->left < CHUNK); 137 return len == 0 ? 1 : 0; 138} 139 140/* get a byte from the file, bail if end of file */ 141#define bget(in) (in->left ? 0 : bload(in), \ 142 in->left ? (in->left--, *(in->next)++) : \ 143 bail("unexpected end of file on ", in->name)) 144 145/* get a four-byte little-endian unsigned integer from file */ 146local unsigned long bget4(bin *in) 147{ 148 unsigned long val; 149 150 val = bget(in); 151 val += (unsigned long)(bget(in)) << 8; 152 val += (unsigned long)(bget(in)) << 16; 153 val += (unsigned long)(bget(in)) << 24; 154 return val; 155} 156 157/* skip bytes in file */ 158local void bskip(bin *in, unsigned skip) 159{ 160 /* check pointer */ 161 if (in == NULL) 162 return; 163 164 /* easy case -- skip bytes in buffer */ 165 if (skip <= in->left) { 166 in->left -= skip; 167 in->next += skip; 168 return; 169 } 170 171 /* skip what's in buffer, discard buffer contents */ 172 skip -= in->left; 173 in->left = 0; 174 175 /* seek past multiples of CHUNK bytes */ 176 if (skip > CHUNK) { 177 unsigned left; 178 179 left = skip & (CHUNK - 1); 180 if (left == 0) { 181 /* exact number of chunks: seek all the way minus one byte to check 182 for end-of-file with a read */ 183 lseek(in->fd, skip - 1, SEEK_CUR); 184 if (read(in->fd, in->buf, 1) != 1) 185 bail("unexpected end of file on ", in->name); 186 return; 187 } 188 189 /* skip the integral chunks, update skip with remainder */ 190 lseek(in->fd, skip - left, SEEK_CUR); 191 skip = left; 192 } 193 194 /* read more input and skip remainder */ 195 bload(in); 196 if (skip > in->left) 197 bail("unexpected end of file on ", in->name); 198 in->left -= skip; 199 in->next += skip; 200} 201 202/* -- end of buffered input functions -- */ 203 204/* skip the gzip header from file in */ 205local void gzhead(bin *in) 206{ 207 int flags; 208 209 /* verify gzip magic header and compression method */ 210 if (bget(in) != 0x1f || bget(in) != 0x8b || bget(in) != 8) 211 bail(in->name, " is not a valid gzip file"); 212 213 /* get and verify flags */ 214 flags = bget(in); 215 if ((flags & 0xe0) != 0) 216 bail("unknown reserved bits set in ", in->name); 217 218 /* skip modification time, extra flags, and os */ 219 bskip(in, 6); 220 221 /* skip extra field if present */ 222 if (flags & 4) { 223 unsigned len; 224 225 len = bget(in); 226 len += (unsigned)(bget(in)) << 8; 227 bskip(in, len); 228 } 229 230 /* skip file name if present */ 231 if (flags & 8) 232 while (bget(in) != 0) 233 ; 234 235 /* skip comment if present */ 236 if (flags & 16) 237 while (bget(in) != 0) 238 ; 239 240 /* skip header crc if present */ 241 if (flags & 2) 242 bskip(in, 2); 243} 244 245/* write a four-byte little-endian unsigned integer to out */ 246local void put4(unsigned long val, FILE *out) 247{ 248 putc(val & 0xff, out); 249 putc((val >> 8) & 0xff, out); 250 putc((val >> 16) & 0xff, out); 251 putc((val >> 24) & 0xff, out); 252} 253 254/* Load up zlib stream from buffered input, bail if end of file */ 255local void zpull(z_streamp strm, bin *in) 256{ 257 if (in->left == 0) 258 bload(in); 259 if (in->left == 0) 260 bail("unexpected end of file on ", in->name); 261 strm->avail_in = in->left; 262 strm->next_in = in->next; 263} 264 265/* Write header for gzip file to out and initialize trailer. */ 266local void gzinit(unsigned long *crc, unsigned long *tot, FILE *out) 267{ 268 fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out); 269 *crc = crc32(0L, Z_NULL, 0); 270 *tot = 0; 271} 272 273/* Copy the compressed data from name, zeroing the last block bit of the last 274 block if clr is true, and adding empty blocks as needed to get to a byte 275 boundary. If clr is false, then the last block becomes the last block of 276 the output, and the gzip trailer is written. crc and tot maintains the 277 crc and length (modulo 2^32) of the output for the trailer. The resulting 278 gzip file is written to out. gzinit() must be called before the first call 279 of gzcopy() to write the gzip header and to initialize crc and tot. */ 280local void gzcopy(char *name, int clr, unsigned long *crc, unsigned long *tot, 281 FILE *out) 282{ 283 int ret; /* return value from zlib functions */ 284 int pos; /* where the "last block" bit is in byte */ 285 int last; /* true if processing the last block */ 286 bin *in; /* buffered input file */ 287 unsigned char *start; /* start of compressed data in buffer */ 288 unsigned char *junk; /* buffer for uncompressed data -- discarded */ 289 z_off_t len; /* length of uncompressed data (support > 4 GB) */ 290 z_stream strm; /* zlib inflate stream */ 291 292 /* open gzip file and skip header */ 293 in = bopen(name); 294 if (in == NULL) 295 bail("could not open ", name); 296 gzhead(in); 297 298 /* allocate buffer for uncompressed data and initialize raw inflate 299 stream */ 300 junk = malloc(CHUNK); 301 strm.zalloc = Z_NULL; 302 strm.zfree = Z_NULL; 303 strm.opaque = Z_NULL; 304 strm.avail_in = 0; 305 strm.next_in = Z_NULL; 306 ret = inflateInit2(&strm, -15); 307 if (junk == NULL || ret != Z_OK) 308 bail("out of memory", ""); 309 310 /* inflate and copy compressed data, clear last-block bit if requested */ 311 len = 0; 312 zpull(&strm, in); 313 start = strm.next_in; 314 last = start[0] & 1; 315 if (last && clr) 316 start[0] &= ~1; 317 strm.avail_out = 0; 318 for (;;) { 319 /* if input used and output done, write used input and get more */ 320 if (strm.avail_in == 0 && strm.avail_out != 0) { 321 fwrite(start, 1, strm.next_in - start, out); 322 start = in->buf; 323 in->left = 0; 324 zpull(&strm, in); 325 } 326 327 /* decompress -- return early when end-of-block reached */ 328 strm.avail_out = CHUNK; 329 strm.next_out = junk; 330 ret = inflate(&strm, Z_BLOCK); 331 switch (ret) { 332 case Z_MEM_ERROR: 333 bail("out of memory", ""); 334 case Z_DATA_ERROR: 335 bail("invalid compressed data in ", in->name); 336 } 337 338 /* update length of uncompressed data */ 339 len += CHUNK - strm.avail_out; 340 341 /* check for block boundary (only get this when block copied out) */ 342 if (strm.data_type & 128) { 343 /* if that was the last block, then done */ 344 if (last) 345 break; 346 347 /* number of unused bits in last byte */ 348 pos = strm.data_type & 7; 349 350 /* find the next last-block bit */ 351 if (pos != 0) { 352 /* next last-block bit is in last used byte */ 353 pos = 0x100 >> pos; 354 last = strm.next_in[-1] & pos; 355 if (last && clr) 356 strm.next_in[-1] &= ~pos; 357 } 358 else { 359 /* next last-block bit is in next unused byte */ 360 if (strm.avail_in == 0) { 361 /* don't have that byte yet -- get it */ 362 fwrite(start, 1, strm.next_in - start, out); 363 start = in->buf; 364 in->left = 0; 365 zpull(&strm, in); 366 } 367 last = strm.next_in[0] & 1; 368 if (last && clr) 369 strm.next_in[0] &= ~1; 370 } 371 } 372 } 373 374 /* update buffer with unused input */ 375 in->left = strm.avail_in; 376 in->next = strm.next_in; 377 378 /* copy used input, write empty blocks to get to byte boundary */ 379 pos = strm.data_type & 7; 380 fwrite(start, 1, in->next - start - 1, out); 381 last = in->next[-1]; 382 if (pos == 0 || !clr) 383 /* already at byte boundary, or last file: write last byte */ 384 putc(last, out); 385 else { 386 /* append empty blocks to last byte */ 387 last &= ((0x100 >> pos) - 1); /* assure unused bits are zero */ 388 if (pos & 1) { 389 /* odd -- append an empty stored block */ 390 putc(last, out); 391 if (pos == 1) 392 putc(0, out); /* two more bits in block header */ 393 fwrite("\0\0\xff\xff", 1, 4, out); 394 } 395 else { 396 /* even -- append 1, 2, or 3 empty fixed blocks */ 397 switch (pos) { 398 case 6: 399 putc(last | 8, out); 400 last = 0; 401 case 4: 402 putc(last | 0x20, out); 403 last = 0; 404 case 2: 405 putc(last | 0x80, out); 406 putc(0, out); 407 } 408 } 409 } 410 411 /* update crc and tot */ 412 *crc = crc32_combine(*crc, bget4(in), len); 413 *tot += (unsigned long)len; 414 415 /* clean up */ 416 inflateEnd(&strm); 417 free(junk); 418 bclose(in); 419 420 /* write trailer if this is the last gzip file */ 421 if (!clr) { 422 put4(*crc, out); 423 put4(*tot, out); 424 } 425} 426 427/* join the gzip files on the command line, write result to stdout */ 428int main(int argc, char **argv) 429{ 430 unsigned long crc, tot; /* running crc and total uncompressed length */ 431 432 /* skip command name */ 433 argc--; 434 argv++; 435 436 /* show usage if no arguments */ 437 if (argc == 0) { 438 fputs("gzjoin usage: gzjoin f1.gz [f2.gz [f3.gz ...]] > fjoin.gz\n", 439 stderr); 440 return 0; 441 } 442 443 /* join gzip files on command line and write to stdout */ 444 gzinit(&crc, &tot, stdout); 445 while (argc--) 446 gzcopy(*argv++, argc, &crc, &tot, stdout); 447 448 /* done */ 449 return 0; 450} 451