1171169Smlaier/*	$NetBSD$	*/
2171169Smlaier
3171169Smlaier/* zran.c -- example of zlib/gzip stream indexing and random access
4171169Smlaier * Copyright (C) 2005 Mark Adler
5171169Smlaier * For conditions of distribution and use, see copyright notice in zlib.h
6171169Smlaier   Version 1.0  29 May 2005  Mark Adler */
7171169Smlaier
8171169Smlaier/* Illustrate the use of Z_BLOCK, inflatePrime(), and inflateSetDictionary()
9171169Smlaier   for random access of a compressed file.  A file containing a zlib or gzip
10171169Smlaier   stream is provided on the command line.  The compressed stream is decoded in
11171169Smlaier   its entirety, and an index built with access points about every SPAN bytes
12171169Smlaier   in the uncompressed output.  The compressed file is left open, and can then
13171169Smlaier   be read randomly, having to decompress on the average SPAN/2 uncompressed
14171169Smlaier   bytes before getting to the desired block of data.
15171169Smlaier
16171169Smlaier   An access point can be created at the start of any deflate block, by saving
17171169Smlaier   the starting file offset and bit of that block, and the 32K bytes of
18171169Smlaier   uncompressed data that precede that block.  Also the uncompressed offset of
19171169Smlaier   that block is saved to provide a referece for locating a desired starting
20171169Smlaier   point in the uncompressed stream.  build_index() works by decompressing the
21171169Smlaier   input zlib or gzip stream a block at a time, and at the end of each block
22171169Smlaier   deciding if enough uncompressed data has gone by to justify the creation of
23171169Smlaier   a new access point.  If so, that point is saved in a data structure that
24171169Smlaier   grows as needed to accommodate the points.
25171169Smlaier
26171169Smlaier   To use the index, an offset in the uncompressed data is provided, for which
27171169Smlaier   the latest accees point at or preceding that offset is located in the index.
28171169Smlaier   The input file is positioned to the specified location in the index, and if
29171169Smlaier   necessary the first few bits of the compressed data is read from the file.
30171169Smlaier   inflate is initialized with those bits and the 32K of uncompressed data, and
31171169Smlaier   the decompression then proceeds until the desired offset in the file is
32171169Smlaier   reached.  Then the decompression continues to read the desired uncompressed
33171169Smlaier   data from the file.
34171169Smlaier
35171169Smlaier   Another approach would be to generate the index on demand.  In that case,
36171169Smlaier   requests for random access reads from the compressed data would try to use
37171169Smlaier   the index, but if a read far enough past the end of the index is required,
38171169Smlaier   then further index entries would be generated and added.
39171169Smlaier
40171169Smlaier   There is some fair bit of overhead to starting inflation for the random
41171169Smlaier   access, mainly copying the 32K byte dictionary.  So if small pieces of the
42171169Smlaier   file are being accessed, it would make sense to implement a cache to hold
43171169Smlaier   some lookahead and avoid many calls to extract() for small lengths.
44171169Smlaier
45171169Smlaier   Another way to build an index would be to use inflateCopy().  That would
46171169Smlaier   not be constrained to have access points at block boundaries, but requires
47171169Smlaier   more memory per access point, and also cannot be saved to file due to the
48171169Smlaier   use of pointers in the state.  The approach here allows for storage of the
49171169Smlaier   index in a file.
50171169Smlaier */
51171169Smlaier
52171169Smlaier#include <stdio.h>
53171169Smlaier#include <stdlib.h>
54171169Smlaier#include <string.h>
55171169Smlaier#include "zlib.h"
56171169Smlaier
57171169Smlaier#define local static
58171169Smlaier
59171169Smlaier#define SPAN 1048576L       /* desired distance between access points */
60171169Smlaier#define WINSIZE 32768U      /* sliding window size */
61171169Smlaier#define CHUNK 16384         /* file input buffer size */
62171169Smlaier
63171169Smlaier/* access point entry */
64171169Smlaierstruct point {
65171169Smlaier    off_t out;          /* corresponding offset in uncompressed data */
66171169Smlaier    off_t in;           /* offset in input file of first full byte */
67171169Smlaier    int bits;           /* number of bits (1-7) from byte at in - 1, or 0 */
68171169Smlaier    unsigned char window[WINSIZE];  /* preceding 32K of uncompressed data */
69171169Smlaier};
70171169Smlaier
71171169Smlaier/* access point list */
72171169Smlaierstruct access {
73171169Smlaier    int have;           /* number of list entries filled in */
74171169Smlaier    int size;           /* number of list entries allocated */
75171169Smlaier    struct point *list; /* allocated list */
76171169Smlaier};
77171169Smlaier
78171169Smlaier/* Deallocate an index built by build_index() */
79171169Smlaierlocal void free_index(struct access *index)
80171169Smlaier{
81171169Smlaier    if (index != NULL) {
82171169Smlaier        free(index->list);
83171169Smlaier        free(index);
84171169Smlaier    }
85171169Smlaier}
86171169Smlaier
87171169Smlaier/* Add an entry to the access point list.  If out of memory, deallocate the
88171169Smlaier   existing list and return NULL. */
89171169Smlaierlocal struct access *addpoint(struct access *index, int bits,
90171169Smlaier    off_t in, off_t out, unsigned left, unsigned char *window)
91171169Smlaier{
92171169Smlaier    struct point *next;
93171169Smlaier
94171169Smlaier    /* if list is empty, create it (start with eight points) */
95171169Smlaier    if (index == NULL) {
96171169Smlaier        index = malloc(sizeof(struct access));
97171169Smlaier        if (index == NULL) return NULL;
98171169Smlaier        index->list = malloc(sizeof(struct point) << 3);
99171169Smlaier        if (index->list == NULL) {
100171169Smlaier            free(index);
101171169Smlaier            return NULL;
102171169Smlaier        }
103171169Smlaier        index->size = 8;
104171169Smlaier        index->have = 0;
105171169Smlaier    }
106171169Smlaier
107171169Smlaier    /* if list is full, make it bigger */
108171169Smlaier    else if (index->have == index->size) {
109171169Smlaier        index->size <<= 1;
110171169Smlaier        next = realloc(index->list, sizeof(struct point) * index->size);
111171169Smlaier        if (next == NULL) {
112171169Smlaier            free_index(index);
113171169Smlaier            return NULL;
114171169Smlaier        }
115171169Smlaier        index->list = next;
116171169Smlaier    }
117171169Smlaier
118171169Smlaier    /* fill in entry and increment how many we have */
119171169Smlaier    next = index->list + index->have;
120171169Smlaier    next->bits = bits;
121171169Smlaier    next->in = in;
122171169Smlaier    next->out = out;
123171169Smlaier    if (left)
124171169Smlaier        memcpy(next->window, window + WINSIZE - left, left);
125171169Smlaier    if (left < WINSIZE)
126171169Smlaier        memcpy(next->window + left, window, WINSIZE - left);
127171169Smlaier    index->have++;
128171169Smlaier
129171169Smlaier    /* return list, possibly reallocated */
130171169Smlaier    return index;
131171169Smlaier}
132171169Smlaier
133171169Smlaier/* Make one entire pass through the compressed stream and build an index, with
134171169Smlaier   access points about every span bytes of uncompressed output -- span is
135171169Smlaier   chosen to balance the speed of random access against the memory requirements
136171169Smlaier   of the list, about 32K bytes per access point.  Note that data after the end
137171169Smlaier   of the first zlib or gzip stream in the file is ignored.  build_index()
138171169Smlaier   returns the number of access points on success (>= 1), Z_MEM_ERROR for out
139171169Smlaier   of memory, Z_DATA_ERROR for an error in the input file, or Z_ERRNO for a
140171169Smlaier   file read error.  On success, *built points to the resulting index. */
141171169Smlaierlocal int build_index(FILE *in, off_t span, struct access **built)
142171169Smlaier{
143171169Smlaier    int ret;
144171169Smlaier    off_t totin, totout;        /* our own total counters to avoid 4GB limit */
145171169Smlaier    off_t last;                 /* totout value of last access point */
146171169Smlaier    struct access *index;       /* access points being generated */
147171169Smlaier    z_stream strm;
148171169Smlaier    unsigned char input[CHUNK];
149171169Smlaier    unsigned char window[WINSIZE];
150171169Smlaier
151171169Smlaier    /* initialize inflate */
152171169Smlaier    strm.zalloc = Z_NULL;
153171169Smlaier    strm.zfree = Z_NULL;
154171169Smlaier    strm.opaque = Z_NULL;
155171169Smlaier    strm.avail_in = 0;
156171169Smlaier    strm.next_in = Z_NULL;
157171169Smlaier    ret = inflateInit2(&strm, 47);      /* automatic zlib or gzip decoding */
158171169Smlaier    if (ret != Z_OK)
159171169Smlaier        return ret;
160171169Smlaier
161171169Smlaier    /* inflate the input, maintain a sliding window, and build an index -- this
162171169Smlaier       also validates the integrity of the compressed data using the check
163171169Smlaier       information at the end of the gzip or zlib stream */
164171169Smlaier    totin = totout = last = 0;
165171169Smlaier    index = NULL;               /* will be allocated by first addpoint() */
166171169Smlaier    strm.avail_out = 0;
167171169Smlaier    do {
168171169Smlaier        /* get some compressed data from input file */
169171169Smlaier        strm.avail_in = fread(input, 1, CHUNK, in);
170171169Smlaier        if (ferror(in)) {
171171169Smlaier            ret = Z_ERRNO;
172171169Smlaier            goto build_index_error;
173171169Smlaier        }
174171169Smlaier        if (strm.avail_in == 0) {
175171169Smlaier            ret = Z_DATA_ERROR;
176171169Smlaier            goto build_index_error;
177171169Smlaier        }
178171169Smlaier        strm.next_in = input;
179171169Smlaier
180171169Smlaier        /* process all of that, or until end of stream */
181171169Smlaier        do {
182171169Smlaier            /* reset sliding window if necessary */
183171169Smlaier            if (strm.avail_out == 0) {
184171169Smlaier                strm.avail_out = WINSIZE;
185171169Smlaier                strm.next_out = window;
186171169Smlaier            }
187171169Smlaier
188171169Smlaier            /* inflate until out of input, output, or at end of block --
189171169Smlaier               update the total input and output counters */
190171169Smlaier            totin += strm.avail_in;
191171169Smlaier            totout += strm.avail_out;
192171169Smlaier            ret = inflate(&strm, Z_BLOCK);      /* return at end of block */
193171169Smlaier            totin -= strm.avail_in;
194171169Smlaier            totout -= strm.avail_out;
195171169Smlaier            if (ret == Z_NEED_DICT)
196171169Smlaier                ret = Z_DATA_ERROR;
197171169Smlaier            if (ret == Z_MEM_ERROR || ret == Z_DATA_ERROR)
198171169Smlaier                goto build_index_error;
199171169Smlaier            if (ret == Z_STREAM_END)
200171169Smlaier                break;
201171169Smlaier
202171169Smlaier            /* if at end of block, consider adding an index entry (note that if
203171169Smlaier               data_type indicates an end-of-block, then all of the
204171169Smlaier               uncompressed data from that block has been delivered, and none
205171169Smlaier               of the compressed data after that block has been consumed,
206171169Smlaier               except for up to seven bits) -- the totout == 0 provides an
207171169Smlaier               entry point after the zlib or gzip header, and assures that the
208171169Smlaier               index always has at least one access point; we avoid creating an
209171169Smlaier               access point after the last block by checking bit 6 of data_type
210171169Smlaier             */
211171169Smlaier            if ((strm.data_type & 128) && !(strm.data_type & 64) &&
212171169Smlaier                (totout == 0 || totout - last > span)) {
213171169Smlaier                index = addpoint(index, strm.data_type & 7, totin,
214171169Smlaier                                 totout, strm.avail_out, window);
215171169Smlaier                if (index == NULL) {
216171169Smlaier                    ret = Z_MEM_ERROR;
217171169Smlaier                    goto build_index_error;
218171169Smlaier                }
219171169Smlaier                last = totout;
220171169Smlaier            }
221171169Smlaier        } while (strm.avail_in != 0);
222171169Smlaier    } while (ret != Z_STREAM_END);
223171169Smlaier
224171169Smlaier    /* clean up and return index (release unused entries in list) */
225171169Smlaier    (void)inflateEnd(&strm);
226171169Smlaier    index = realloc(index, sizeof(struct point) * index->have);
227171169Smlaier    index->size = index->have;
228171169Smlaier    *built = index;
229171169Smlaier    return index->size;
230171169Smlaier
231171169Smlaier    /* return error */
232171169Smlaier  build_index_error:
233171169Smlaier    (void)inflateEnd(&strm);
234171169Smlaier    if (index != NULL)
235171169Smlaier        free_index(index);
236171169Smlaier    return ret;
237171169Smlaier}
238171169Smlaier
239171169Smlaier/* Use the index to read len bytes from offset into buf, return bytes read or
240171169Smlaier   negative for error (Z_DATA_ERROR or Z_MEM_ERROR).  If data is requested past
241171169Smlaier   the end of the uncompressed data, then extract() will return a value less
242171169Smlaier   than len, indicating how much as actually read into buf.  This function
243171169Smlaier   should not return a data error unless the file was modified since the index
244171169Smlaier   was generated.  extract() may also return Z_ERRNO if there is an error on
245171169Smlaier   reading or seeking the input file. */
246171169Smlaierlocal int extract(FILE *in, struct access *index, off_t offset,
247171169Smlaier                  unsigned char *buf, int len)
248171169Smlaier{
249171169Smlaier    int ret, skip;
250171169Smlaier    z_stream strm;
251171169Smlaier    struct point *here;
252171169Smlaier    unsigned char input[CHUNK];
253171169Smlaier    unsigned char discard[WINSIZE];
254171169Smlaier
255171169Smlaier    /* proceed only if something reasonable to do */
256171169Smlaier    if (len < 0)
257171169Smlaier        return 0;
258171169Smlaier
259171169Smlaier    /* find where in stream to start */
260171169Smlaier    here = index->list;
261171169Smlaier    ret = index->have;
262171169Smlaier    while (--ret && here[1].out <= offset)
263171169Smlaier        here++;
264171169Smlaier
265171169Smlaier    /* initialize file and inflate state to start there */
266171169Smlaier    strm.zalloc = Z_NULL;
267171169Smlaier    strm.zfree = Z_NULL;
268171169Smlaier    strm.opaque = Z_NULL;
269171169Smlaier    strm.avail_in = 0;
270171169Smlaier    strm.next_in = Z_NULL;
271171169Smlaier    ret = inflateInit2(&strm, -15);         /* raw inflate */
272171169Smlaier    if (ret != Z_OK)
273171169Smlaier        return ret;
274171169Smlaier    ret = fseeko(in, here->in - (here->bits ? 1 : 0), SEEK_SET);
275171169Smlaier    if (ret == -1)
276171169Smlaier        goto extract_ret;
277171169Smlaier    if (here->bits) {
278171169Smlaier        ret = getc(in);
279171169Smlaier        if (ret == -1) {
280171169Smlaier            ret = ferror(in) ? Z_ERRNO : Z_DATA_ERROR;
281171169Smlaier            goto extract_ret;
282171169Smlaier        }
283171169Smlaier        (void)inflatePrime(&strm, here->bits, ret >> (8 - here->bits));
284171169Smlaier    }
285171169Smlaier    (void)inflateSetDictionary(&strm, here->window, WINSIZE);
286171169Smlaier
287171169Smlaier    /* skip uncompressed bytes until offset reached, then satisfy request */
288171169Smlaier    offset -= here->out;
289171169Smlaier    strm.avail_in = 0;
290171169Smlaier    skip = 1;                               /* while skipping to offset */
291171169Smlaier    do {
292171169Smlaier        /* define where to put uncompressed data, and how much */
293171169Smlaier        if (offset == 0 && skip) {          /* at offset now */
294171169Smlaier            strm.avail_out = len;
295171169Smlaier            strm.next_out = buf;
296171169Smlaier            skip = 0;                       /* only do this once */
297171169Smlaier        }
298171169Smlaier        if (offset > WINSIZE) {             /* skip WINSIZE bytes */
299171169Smlaier            strm.avail_out = WINSIZE;
300171169Smlaier            strm.next_out = discard;
301171169Smlaier            offset -= WINSIZE;
302171169Smlaier        }
303171169Smlaier        else if (offset != 0) {             /* last skip */
304171169Smlaier            strm.avail_out = (unsigned)offset;
305171169Smlaier            strm.next_out = discard;
306171169Smlaier            offset = 0;
307171169Smlaier        }
308171169Smlaier
309171169Smlaier        /* uncompress until avail_out filled, or end of stream */
310171169Smlaier        do {
311171169Smlaier            if (strm.avail_in == 0) {
312171169Smlaier                strm.avail_in = fread(input, 1, CHUNK, in);
313171169Smlaier                if (ferror(in)) {
314171169Smlaier                    ret = Z_ERRNO;
315171169Smlaier                    goto extract_ret;
316171169Smlaier                }
317171169Smlaier                if (strm.avail_in == 0) {
318171169Smlaier                    ret = Z_DATA_ERROR;
319171169Smlaier                    goto extract_ret;
320171169Smlaier                }
321171169Smlaier                strm.next_in = input;
322171169Smlaier            }
323171169Smlaier            ret = inflate(&strm, Z_NO_FLUSH);       /* normal inflate */
324171169Smlaier            if (ret == Z_NEED_DICT)
325171169Smlaier                ret = Z_DATA_ERROR;
326171169Smlaier            if (ret == Z_MEM_ERROR || ret == Z_DATA_ERROR)
327171169Smlaier                goto extract_ret;
328171169Smlaier            if (ret == Z_STREAM_END)
329171169Smlaier                break;
330171169Smlaier        } while (strm.avail_out != 0);
331171169Smlaier
332171169Smlaier        /* if reach end of stream, then don't keep trying to get more */
333171169Smlaier        if (ret == Z_STREAM_END)
334171169Smlaier            break;
335171169Smlaier
336171169Smlaier        /* do until offset reached and requested data read, or stream ends */
337171169Smlaier    } while (skip);
338171169Smlaier
339171169Smlaier    /* compute number of uncompressed bytes read after offset */
340171169Smlaier    ret = skip ? 0 : len - strm.avail_out;
341171169Smlaier
342171169Smlaier    /* clean up and return bytes read or error */
343171169Smlaier  extract_ret:
344171169Smlaier    (void)inflateEnd(&strm);
345171169Smlaier    return ret;
346171169Smlaier}
347171169Smlaier
348171169Smlaier/* Demonstrate the use of build_index() and extract() by processing the file
349171169Smlaier   provided on the command line, and the extracting 16K from about 2/3rds of
350171169Smlaier   the way through the uncompressed output, and writing that to stdout. */
351171169Smlaierint main(int argc, char **argv)
352171169Smlaier{
353171169Smlaier    int len;
354171169Smlaier    off_t offset;
355171169Smlaier    FILE *in;
356171169Smlaier    struct access *index;
357171169Smlaier    unsigned char buf[CHUNK];
358171169Smlaier
359171169Smlaier    /* open input file */
360171169Smlaier    if (argc != 2) {
361171169Smlaier        fprintf(stderr, "usage: zran file.gz\n");
362171169Smlaier        return 1;
363171169Smlaier    }
364171169Smlaier    in = fopen(argv[1], "rb");
365171169Smlaier    if (in == NULL) {
366171169Smlaier        fprintf(stderr, "zran: could not open %s for reading\n", argv[1]);
367171169Smlaier        return 1;
368171169Smlaier    }
369171169Smlaier
370171169Smlaier    /* build index */
371    len = build_index(in, SPAN, &index);
372    if (len < 0) {
373        fclose(in);
374        switch (len) {
375        case Z_MEM_ERROR:
376            fprintf(stderr, "zran: out of memory\n");
377            break;
378        case Z_DATA_ERROR:
379            fprintf(stderr, "zran: compressed data error in %s\n", argv[1]);
380            break;
381        case Z_ERRNO:
382            fprintf(stderr, "zran: read error on %s\n", argv[1]);
383            break;
384        default:
385            fprintf(stderr, "zran: error %d while building index\n", len);
386        }
387        return 1;
388    }
389    fprintf(stderr, "zran: built index with %d access points\n", len);
390
391    /* use index by reading some bytes from an arbitrary offset */
392    offset = (index->list[index->have - 1].out << 1) / 3;
393    len = extract(in, index, offset, buf, CHUNK);
394    if (len < 0)
395        fprintf(stderr, "zran: extraction failed: %s error\n",
396                len == Z_MEM_ERROR ? "out of memory" : "input corrupted");
397    else {
398        fwrite(buf, 1, len, stdout);
399        fprintf(stderr, "zran: extracted %d bytes at %llu\n", len, offset);
400    }
401
402    /* clean up and exit */
403    free_index(index);
404    fclose(in);
405    return 0;
406}
407