1/*	$NetBSD: zran.c,v 1.1.1.1 2006/01/14 20:11:10 christos Exp $	*/
2
3/* zran.c -- example of zlib/gzip stream indexing and random access
4 * Copyright (C) 2005 Mark Adler
5 * For conditions of distribution and use, see copyright notice in zlib.h
6   Version 1.0  29 May 2005  Mark Adler */
7
8/* Illustrate the use of Z_BLOCK, inflatePrime(), and inflateSetDictionary()
9   for random access of a compressed file.  A file containing a zlib or gzip
10   stream is provided on the command line.  The compressed stream is decoded in
11   its entirety, and an index built with access points about every SPAN bytes
12   in the uncompressed output.  The compressed file is left open, and can then
13   be read randomly, having to decompress on the average SPAN/2 uncompressed
14   bytes before getting to the desired block of data.
15
16   An access point can be created at the start of any deflate block, by saving
17   the starting file offset and bit of that block, and the 32K bytes of
18   uncompressed data that precede that block.  Also the uncompressed offset of
19   that block is saved to provide a referece for locating a desired starting
20   point in the uncompressed stream.  build_index() works by decompressing the
21   input zlib or gzip stream a block at a time, and at the end of each block
22   deciding if enough uncompressed data has gone by to justify the creation of
23   a new access point.  If so, that point is saved in a data structure that
24   grows as needed to accommodate the points.
25
26   To use the index, an offset in the uncompressed data is provided, for which
27   the latest accees point at or preceding that offset is located in the index.
28   The input file is positioned to the specified location in the index, and if
29   necessary the first few bits of the compressed data is read from the file.
30   inflate is initialized with those bits and the 32K of uncompressed data, and
31   the decompression then proceeds until the desired offset in the file is
32   reached.  Then the decompression continues to read the desired uncompressed
33   data from the file.
34
35   Another approach would be to generate the index on demand.  In that case,
36   requests for random access reads from the compressed data would try to use
37   the index, but if a read far enough past the end of the index is required,
38   then further index entries would be generated and added.
39
40   There is some fair bit of overhead to starting inflation for the random
41   access, mainly copying the 32K byte dictionary.  So if small pieces of the
42   file are being accessed, it would make sense to implement a cache to hold
43   some lookahead and avoid many calls to extract() for small lengths.
44
45   Another way to build an index would be to use inflateCopy().  That would
46   not be constrained to have access points at block boundaries, but requires
47   more memory per access point, and also cannot be saved to file due to the
48   use of pointers in the state.  The approach here allows for storage of the
49   index in a file.
50 */
51
52#include <stdio.h>
53#include <stdlib.h>
54#include <string.h>
55#include "zlib.h"
56
57#define local static
58
59#define SPAN 1048576L       /* desired distance between access points */
60#define WINSIZE 32768U      /* sliding window size */
61#define CHUNK 16384         /* file input buffer size */
62
63/* access point entry */
64struct point {
65    off_t out;          /* corresponding offset in uncompressed data */
66    off_t in;           /* offset in input file of first full byte */
67    int bits;           /* number of bits (1-7) from byte at in - 1, or 0 */
68    unsigned char window[WINSIZE];  /* preceding 32K of uncompressed data */
69};
70
71/* access point list */
72struct access {
73    int have;           /* number of list entries filled in */
74    int size;           /* number of list entries allocated */
75    struct point *list; /* allocated list */
76};
77
78/* Deallocate an index built by build_index() */
79local void free_index(struct access *index)
80{
81    if (index != NULL) {
82        free(index->list);
83        free(index);
84    }
85}
86
87/* Add an entry to the access point list.  If out of memory, deallocate the
88   existing list and return NULL. */
89local struct access *addpoint(struct access *index, int bits,
90    off_t in, off_t out, unsigned left, unsigned char *window)
91{
92    struct point *next;
93
94    /* if list is empty, create it (start with eight points) */
95    if (index == NULL) {
96        index = malloc(sizeof(struct access));
97        if (index == NULL) return NULL;
98        index->list = malloc(sizeof(struct point) << 3);
99        if (index->list == NULL) {
100            free(index);
101            return NULL;
102        }
103        index->size = 8;
104        index->have = 0;
105    }
106
107    /* if list is full, make it bigger */
108    else if (index->have == index->size) {
109        index->size <<= 1;
110        next = realloc(index->list, sizeof(struct point) * index->size);
111        if (next == NULL) {
112            free_index(index);
113            return NULL;
114        }
115        index->list = next;
116    }
117
118    /* fill in entry and increment how many we have */
119    next = index->list + index->have;
120    next->bits = bits;
121    next->in = in;
122    next->out = out;
123    if (left)
124        memcpy(next->window, window + WINSIZE - left, left);
125    if (left < WINSIZE)
126        memcpy(next->window + left, window, WINSIZE - left);
127    index->have++;
128
129    /* return list, possibly reallocated */
130    return index;
131}
132
133/* Make one entire pass through the compressed stream and build an index, with
134   access points about every span bytes of uncompressed output -- span is
135   chosen to balance the speed of random access against the memory requirements
136   of the list, about 32K bytes per access point.  Note that data after the end
137   of the first zlib or gzip stream in the file is ignored.  build_index()
138   returns the number of access points on success (>= 1), Z_MEM_ERROR for out
139   of memory, Z_DATA_ERROR for an error in the input file, or Z_ERRNO for a
140   file read error.  On success, *built points to the resulting index. */
141local int build_index(FILE *in, off_t span, struct access **built)
142{
143    int ret;
144    off_t totin, totout;        /* our own total counters to avoid 4GB limit */
145    off_t last;                 /* totout value of last access point */
146    struct access *index;       /* access points being generated */
147    z_stream strm;
148    unsigned char input[CHUNK];
149    unsigned char window[WINSIZE];
150
151    /* initialize inflate */
152    strm.zalloc = Z_NULL;
153    strm.zfree = Z_NULL;
154    strm.opaque = Z_NULL;
155    strm.avail_in = 0;
156    strm.next_in = Z_NULL;
157    ret = inflateInit2(&strm, 47);      /* automatic zlib or gzip decoding */
158    if (ret != Z_OK)
159        return ret;
160
161    /* inflate the input, maintain a sliding window, and build an index -- this
162       also validates the integrity of the compressed data using the check
163       information at the end of the gzip or zlib stream */
164    totin = totout = last = 0;
165    index = NULL;               /* will be allocated by first addpoint() */
166    strm.avail_out = 0;
167    do {
168        /* get some compressed data from input file */
169        strm.avail_in = fread(input, 1, CHUNK, in);
170        if (ferror(in)) {
171            ret = Z_ERRNO;
172            goto build_index_error;
173        }
174        if (strm.avail_in == 0) {
175            ret = Z_DATA_ERROR;
176            goto build_index_error;
177        }
178        strm.next_in = input;
179
180        /* process all of that, or until end of stream */
181        do {
182            /* reset sliding window if necessary */
183            if (strm.avail_out == 0) {
184                strm.avail_out = WINSIZE;
185                strm.next_out = window;
186            }
187
188            /* inflate until out of input, output, or at end of block --
189               update the total input and output counters */
190            totin += strm.avail_in;
191            totout += strm.avail_out;
192            ret = inflate(&strm, Z_BLOCK);      /* return at end of block */
193            totin -= strm.avail_in;
194            totout -= strm.avail_out;
195            if (ret == Z_NEED_DICT)
196                ret = Z_DATA_ERROR;
197            if (ret == Z_MEM_ERROR || ret == Z_DATA_ERROR)
198                goto build_index_error;
199            if (ret == Z_STREAM_END)
200                break;
201
202            /* if at end of block, consider adding an index entry (note that if
203               data_type indicates an end-of-block, then all of the
204               uncompressed data from that block has been delivered, and none
205               of the compressed data after that block has been consumed,
206               except for up to seven bits) -- the totout == 0 provides an
207               entry point after the zlib or gzip header, and assures that the
208               index always has at least one access point; we avoid creating an
209               access point after the last block by checking bit 6 of data_type
210             */
211            if ((strm.data_type & 128) && !(strm.data_type & 64) &&
212                (totout == 0 || totout - last > span)) {
213                index = addpoint(index, strm.data_type & 7, totin,
214                                 totout, strm.avail_out, window);
215                if (index == NULL) {
216                    ret = Z_MEM_ERROR;
217                    goto build_index_error;
218                }
219                last = totout;
220            }
221        } while (strm.avail_in != 0);
222    } while (ret != Z_STREAM_END);
223
224    /* clean up and return index (release unused entries in list) */
225    (void)inflateEnd(&strm);
226    index = realloc(index, sizeof(struct point) * index->have);
227    index->size = index->have;
228    *built = index;
229    return index->size;
230
231    /* return error */
232  build_index_error:
233    (void)inflateEnd(&strm);
234    if (index != NULL)
235        free_index(index);
236    return ret;
237}
238
239/* Use the index to read len bytes from offset into buf, return bytes read or
240   negative for error (Z_DATA_ERROR or Z_MEM_ERROR).  If data is requested past
241   the end of the uncompressed data, then extract() will return a value less
242   than len, indicating how much as actually read into buf.  This function
243   should not return a data error unless the file was modified since the index
244   was generated.  extract() may also return Z_ERRNO if there is an error on
245   reading or seeking the input file. */
246local int extract(FILE *in, struct access *index, off_t offset,
247                  unsigned char *buf, int len)
248{
249    int ret, skip;
250    z_stream strm;
251    struct point *here;
252    unsigned char input[CHUNK];
253    unsigned char discard[WINSIZE];
254
255    /* proceed only if something reasonable to do */
256    if (len < 0)
257        return 0;
258
259    /* find where in stream to start */
260    here = index->list;
261    ret = index->have;
262    while (--ret && here[1].out <= offset)
263        here++;
264
265    /* initialize file and inflate state to start there */
266    strm.zalloc = Z_NULL;
267    strm.zfree = Z_NULL;
268    strm.opaque = Z_NULL;
269    strm.avail_in = 0;
270    strm.next_in = Z_NULL;
271    ret = inflateInit2(&strm, -15);         /* raw inflate */
272    if (ret != Z_OK)
273        return ret;
274    ret = fseeko(in, here->in - (here->bits ? 1 : 0), SEEK_SET);
275    if (ret == -1)
276        goto extract_ret;
277    if (here->bits) {
278        ret = getc(in);
279        if (ret == -1) {
280            ret = ferror(in) ? Z_ERRNO : Z_DATA_ERROR;
281            goto extract_ret;
282        }
283        (void)inflatePrime(&strm, here->bits, ret >> (8 - here->bits));
284    }
285    (void)inflateSetDictionary(&strm, here->window, WINSIZE);
286
287    /* skip uncompressed bytes until offset reached, then satisfy request */
288    offset -= here->out;
289    strm.avail_in = 0;
290    skip = 1;                               /* while skipping to offset */
291    do {
292        /* define where to put uncompressed data, and how much */
293        if (offset == 0 && skip) {          /* at offset now */
294            strm.avail_out = len;
295            strm.next_out = buf;
296            skip = 0;                       /* only do this once */
297        }
298        if (offset > WINSIZE) {             /* skip WINSIZE bytes */
299            strm.avail_out = WINSIZE;
300            strm.next_out = discard;
301            offset -= WINSIZE;
302        }
303        else if (offset != 0) {             /* last skip */
304            strm.avail_out = (unsigned)offset;
305            strm.next_out = discard;
306            offset = 0;
307        }
308
309        /* uncompress until avail_out filled, or end of stream */
310        do {
311            if (strm.avail_in == 0) {
312                strm.avail_in = fread(input, 1, CHUNK, in);
313                if (ferror(in)) {
314                    ret = Z_ERRNO;
315                    goto extract_ret;
316                }
317                if (strm.avail_in == 0) {
318                    ret = Z_DATA_ERROR;
319                    goto extract_ret;
320                }
321                strm.next_in = input;
322            }
323            ret = inflate(&strm, Z_NO_FLUSH);       /* normal inflate */
324            if (ret == Z_NEED_DICT)
325                ret = Z_DATA_ERROR;
326            if (ret == Z_MEM_ERROR || ret == Z_DATA_ERROR)
327                goto extract_ret;
328            if (ret == Z_STREAM_END)
329                break;
330        } while (strm.avail_out != 0);
331
332        /* if reach end of stream, then don't keep trying to get more */
333        if (ret == Z_STREAM_END)
334            break;
335
336        /* do until offset reached and requested data read, or stream ends */
337    } while (skip);
338
339    /* compute number of uncompressed bytes read after offset */
340    ret = skip ? 0 : len - strm.avail_out;
341
342    /* clean up and return bytes read or error */
343  extract_ret:
344    (void)inflateEnd(&strm);
345    return ret;
346}
347
348/* Demonstrate the use of build_index() and extract() by processing the file
349   provided on the command line, and the extracting 16K from about 2/3rds of
350   the way through the uncompressed output, and writing that to stdout. */
351int main(int argc, char **argv)
352{
353    int len;
354    off_t offset;
355    FILE *in;
356    struct access *index;
357    unsigned char buf[CHUNK];
358
359    /* open input file */
360    if (argc != 2) {
361        fprintf(stderr, "usage: zran file.gz\n");
362        return 1;
363    }
364    in = fopen(argv[1], "rb");
365    if (in == NULL) {
366        fprintf(stderr, "zran: could not open %s for reading\n", argv[1]);
367        return 1;
368    }
369
370    /* build index */
371    len = build_index(in, SPAN, &index);
372    if (len < 0) {
373        fclose(in);
374        switch (len) {
375        case Z_MEM_ERROR:
376            fprintf(stderr, "zran: out of memory\n");
377            break;
378        case Z_DATA_ERROR:
379            fprintf(stderr, "zran: compressed data error in %s\n", argv[1]);
380            break;
381        case Z_ERRNO:
382            fprintf(stderr, "zran: read error on %s\n", argv[1]);
383            break;
384        default:
385            fprintf(stderr, "zran: error %d while building index\n", len);
386        }
387        return 1;
388    }
389    fprintf(stderr, "zran: built index with %d access points\n", len);
390
391    /* use index by reading some bytes from an arbitrary offset */
392    offset = (index->list[index->have - 1].out << 1) / 3;
393    len = extract(in, index, offset, buf, CHUNK);
394    if (len < 0)
395        fprintf(stderr, "zran: extraction failed: %s error\n",
396                len == Z_MEM_ERROR ? "out of memory" : "input corrupted");
397    else {
398        fwrite(buf, 1, len, stdout);
399        fprintf(stderr, "zran: extracted %d bytes at %llu\n", len, offset);
400    }
401
402    /* clean up and exit */
403    free_index(index);
404    fclose(in);
405    return 0;
406}
407