1194579Sdelphij/*-
2194579Sdelphij * Copyright (c) 2009 Xin LI <delphij@FreeBSD.org>
3194579Sdelphij * All rights reserved.
4194579Sdelphij *
5194579Sdelphij * Redistribution and use in source and binary forms, with or without
6194579Sdelphij * modification, are permitted provided that the following conditions
7194579Sdelphij * are met:
8194579Sdelphij * 1. Redistributions of source code must retain the above copyright
9194579Sdelphij *    notice, this list of conditions and the following disclaimer.
10194579Sdelphij * 2. Redistributions in binary form must reproduce the above copyright
11194579Sdelphij *    notice, this list of conditions and the following disclaimer in the
12194579Sdelphij *    documentation and/or other materials provided with the distribution.
13194579Sdelphij *
14194579Sdelphij * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15194579Sdelphij * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16194579Sdelphij * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17194579Sdelphij * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18194579Sdelphij * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19194579Sdelphij * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20194579Sdelphij * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21194579Sdelphij * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22194579Sdelphij * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23194579Sdelphij * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24194579Sdelphij * SUCH DAMAGE.
25194579Sdelphij *
26194579Sdelphij * $FreeBSD: releng/10.3/usr.bin/gzip/unpack.c 213927 2010-10-16 15:24:04Z bcr $
27194579Sdelphij */
28194579Sdelphij
29194579Sdelphij/* This file is #included by gzip.c */
30194579Sdelphij
31194579Sdelphij/*
32194579Sdelphij * pack(1) file format:
33194579Sdelphij *
34194579Sdelphij * The first 7 bytes is the header:
35194579Sdelphij *	00, 01 - Signature (US, RS), we already validated it earlier.
36194579Sdelphij *	02..05 - Uncompressed size
37194579Sdelphij *	    06 - Level for the huffman tree (<=24)
38194579Sdelphij *
39194579Sdelphij * pack(1) will then store symbols (leaf) nodes count in each huffman
40194579Sdelphij * tree levels, each level would consume 1 byte (See [1]).
41194579Sdelphij *
42194579Sdelphij * After the symbol count table, there is the symbol table, storing
43213927Sbcr * symbols represented by corresponding leaf node.  EOB is not being
44194579Sdelphij * explicitly transmitted (not necessary anyway) in the symbol table.
45194579Sdelphij *
46194579Sdelphij * Compressed data goes after the symbol table.
47194579Sdelphij *
48194579Sdelphij * NOTES
49194579Sdelphij *
50194579Sdelphij * [1] If we count EOB into the symbols, that would mean that we will
51194579Sdelphij * have at most 256 symbols in the huffman tree.  pack(1) rejects empty
52194579Sdelphij * file and files that just repeats one character, which means that we
53194579Sdelphij * will have at least 2 symbols.  Therefore, pack(1) would reduce the
54194579Sdelphij * last level symbol count by 2 which makes it a number in
55194579Sdelphij * range [0..254], so all levels' symbol count would fit into 1 byte.
56194579Sdelphij */
57194579Sdelphij
58194579Sdelphij#define	PACK_HEADER_LENGTH	7
59194579Sdelphij#define	HTREE_MAXLEVEL		24
60194579Sdelphij
61194579Sdelphij/*
62194579Sdelphij * unpack descriptor
63194579Sdelphij *
64213927Sbcr * Represent the huffman tree in a similar way that pack(1) would
65194579Sdelphij * store in a packed file.  We store all symbols in a linear table,
66194579Sdelphij * and store pointers to each level's first symbol.  In addition to
67194579Sdelphij * that, maintain two counts for each level: inner nodes count and
68194579Sdelphij * leaf nodes count.
69194579Sdelphij */
70194579Sdelphijtypedef struct {
71194579Sdelphij	int		symbol_size;	/* Size of the symbol table */
72194579Sdelphij	int		treelevels;	/* Levels for the huffman tree */
73194579Sdelphij
74194579Sdelphij	int		*symbolsin;	/* Table of leaf symbols count in
75194579Sdelphij					   each level */
76194579Sdelphij	int		*inodesin;	/* Table of internal nodes count in
77194579Sdelphij					   each level */
78194579Sdelphij
79194579Sdelphij	char		*symbol;	/* The symbol table */
80194579Sdelphij	char		*symbol_eob;	/* Pointer to the EOB symbol */
81194579Sdelphij	char		**tree;		/* Decoding huffman tree (pointers to
82194579Sdelphij					   first symbol of each tree level */
83194579Sdelphij
84194579Sdelphij	off_t		uncompressed_size; /* Uncompressed size */
85194579Sdelphij	FILE		*fpIn;		/* Input stream */
86194579Sdelphij	FILE		*fpOut;		/* Output stream */
87194579Sdelphij} unpack_descriptor_t;
88194579Sdelphij
89194579Sdelphij/*
90194579Sdelphij * Release resource allocated to an unpack descriptor.
91194579Sdelphij *
92194579Sdelphij * Caller is responsible to make sure that all of these pointers are
93194579Sdelphij * initialized (in our case, they all point to valid memory block).
94194579Sdelphij * We don't zero out pointers here because nobody else would ever
95213927Sbcr * reference the memory block without scrubbing them.
96194579Sdelphij */
97194579Sdelphijstatic void
98194579Sdelphijunpack_descriptor_fini(unpack_descriptor_t *unpackd)
99194579Sdelphij{
100194579Sdelphij
101194579Sdelphij	free(unpackd->symbolsin);
102194579Sdelphij	free(unpackd->inodesin);
103194579Sdelphij	free(unpackd->symbol);
104194579Sdelphij	free(unpackd->tree);
105194579Sdelphij
106194579Sdelphij	fclose(unpackd->fpIn);
107194579Sdelphij	fclose(unpackd->fpOut);
108194579Sdelphij}
109194579Sdelphij
110194579Sdelphij/*
111194579Sdelphij * Recursively fill the internal node count table
112194579Sdelphij */
113194579Sdelphijstatic void
114194579Sdelphijunpackd_fill_inodesin(const unpack_descriptor_t *unpackd, int level)
115194579Sdelphij{
116194579Sdelphij
117194579Sdelphij	/*
118194579Sdelphij	 * The internal nodes would be 1/2 of total internal nodes and
119194579Sdelphij	 * leaf nodes in the next level.  For the last level there
120213927Sbcr	 * would be no internal node by definition.
121194579Sdelphij	 */
122194579Sdelphij	if (level < unpackd->treelevels) {
123194579Sdelphij		unpackd_fill_inodesin(unpackd, level + 1);
124194579Sdelphij		unpackd->inodesin[level] = (unpackd->inodesin[level + 1] +
125194579Sdelphij					  unpackd->symbolsin[level + 1]) / 2;
126194579Sdelphij	} else
127194579Sdelphij		unpackd->inodesin[level] = 0;
128194579Sdelphij}
129194579Sdelphij
130194579Sdelphij/*
131194579Sdelphij * Update counter for accepted bytes
132194579Sdelphij */
133194579Sdelphijstatic void
134194579Sdelphijaccepted_bytes(off_t *bytes_in, off_t newbytes)
135194579Sdelphij{
136194579Sdelphij
137194579Sdelphij	if (bytes_in != NULL)
138194579Sdelphij		(*bytes_in) += newbytes;
139194579Sdelphij}
140194579Sdelphij
141194579Sdelphij/*
142194579Sdelphij * Read file header and construct the tree.  Also, prepare the buffered I/O
143213927Sbcr * for decode routine.
144194579Sdelphij *
145194579Sdelphij * Return value is uncompressed size.
146194579Sdelphij */
147194579Sdelphijstatic void
148194579Sdelphijunpack_parse_header(int in, int out, char *pre, size_t prelen, off_t *bytes_in,
149194579Sdelphij    unpack_descriptor_t *unpackd)
150194579Sdelphij{
151194579Sdelphij	unsigned char hdr[PACK_HEADER_LENGTH];	/* buffer for header */
152194579Sdelphij	ssize_t bytesread;		/* Bytes read from the file */
153194579Sdelphij	int i, j, thisbyte;
154194579Sdelphij
155194579Sdelphij	/* Prepend the header buffer if we already read some data */
156194579Sdelphij	if (prelen != 0)
157194579Sdelphij		memcpy(hdr, pre, prelen);
158194579Sdelphij
159194579Sdelphij	/* Read in and fill the rest bytes of header */
160194579Sdelphij	bytesread = read(in, hdr + prelen, PACK_HEADER_LENGTH - prelen);
161194579Sdelphij	if (bytesread < 0)
162194579Sdelphij		maybe_err("Error reading pack header");
163194579Sdelphij
164194579Sdelphij	accepted_bytes(bytes_in, PACK_HEADER_LENGTH);
165194579Sdelphij
166194579Sdelphij	/* Obtain uncompressed length (bytes 2,3,4,5)*/
167194579Sdelphij	unpackd->uncompressed_size = 0;
168194579Sdelphij	for (i = 2; i <= 5; i++) {
169194579Sdelphij		unpackd->uncompressed_size <<= 8;
170194579Sdelphij		unpackd->uncompressed_size |= hdr[i];
171194579Sdelphij	}
172194579Sdelphij
173194579Sdelphij	/* Get the levels of the tree */
174194579Sdelphij	unpackd->treelevels = hdr[6];
175194579Sdelphij	if (unpackd->treelevels > HTREE_MAXLEVEL || unpackd->treelevels < 1)
176194579Sdelphij		maybe_errx("Huffman tree has insane levels");
177194579Sdelphij
178194579Sdelphij	/* Let libc take care for buffering from now on */
179194579Sdelphij	if ((unpackd->fpIn = fdopen(in, "r")) == NULL)
180194579Sdelphij		maybe_err("Can not fdopen() input stream");
181194579Sdelphij	if ((unpackd->fpOut = fdopen(out, "w")) == NULL)
182194579Sdelphij		maybe_err("Can not fdopen() output stream");
183194579Sdelphij
184194579Sdelphij	/* Allocate for the tables of bounds and the tree itself */
185194579Sdelphij	unpackd->inodesin =
186194579Sdelphij	    calloc(unpackd->treelevels, sizeof(*(unpackd->inodesin)));
187194579Sdelphij	unpackd->symbolsin =
188194579Sdelphij	    calloc(unpackd->treelevels, sizeof(*(unpackd->symbolsin)));
189194579Sdelphij	unpackd->tree =
190194579Sdelphij	    calloc(unpackd->treelevels, (sizeof (*(unpackd->tree))));
191194579Sdelphij	if (unpackd->inodesin == NULL || unpackd->symbolsin == NULL ||
192194579Sdelphij	    unpackd->tree == NULL)
193194579Sdelphij		maybe_err("calloc");
194194579Sdelphij
195194579Sdelphij	/* We count from 0 so adjust to match array upper bound */
196194579Sdelphij	unpackd->treelevels--;
197194579Sdelphij
198213927Sbcr	/* Read the levels symbol count table and calculate total */
199194579Sdelphij	unpackd->symbol_size = 1;		/* EOB */
200194579Sdelphij	for (i = 0; i <= unpackd->treelevels; i++) {
201194579Sdelphij		if ((thisbyte = fgetc(unpackd->fpIn)) == EOF)
202194579Sdelphij			maybe_err("File appears to be truncated");
203194579Sdelphij		unpackd->symbolsin[i] = (unsigned char)thisbyte;
204194579Sdelphij		unpackd->symbol_size += unpackd->symbolsin[i];
205194579Sdelphij	}
206194579Sdelphij	accepted_bytes(bytes_in, unpackd->treelevels);
207194579Sdelphij	if (unpackd->symbol_size > 256)
208194579Sdelphij		maybe_errx("Bad symbol table");
209194579Sdelphij
210194579Sdelphij	/* Allocate for the symbol table, point symbol_eob at the beginning */
211194579Sdelphij	unpackd->symbol_eob = unpackd->symbol = calloc(1, unpackd->symbol_size);
212194579Sdelphij	if (unpackd->symbol == NULL)
213194579Sdelphij		maybe_err("calloc");
214194579Sdelphij
215194579Sdelphij	/*
216194579Sdelphij	 * Read in the symbol table, which contain [2, 256] symbols.
217194579Sdelphij	 * In order to fit the count in one byte, pack(1) would offset
218194579Sdelphij	 * it by reducing 2 from the actual number from the last level.
219194579Sdelphij	 *
220194579Sdelphij	 * We adjust the last level's symbol count by 1 here, because
221194579Sdelphij	 * the EOB symbol is not being transmitted explicitly.  Another
222194579Sdelphij	 * adjustment would be done later afterward.
223194579Sdelphij	 */
224194579Sdelphij	unpackd->symbolsin[unpackd->treelevels]++;
225194579Sdelphij	for (i = 0; i <= unpackd->treelevels; i++) {
226194579Sdelphij		unpackd->tree[i] = unpackd->symbol_eob;
227194579Sdelphij		for (j = 0; j < unpackd->symbolsin[i]; j++) {
228194579Sdelphij			if ((thisbyte = fgetc(unpackd->fpIn)) == EOF)
229194579Sdelphij				maybe_errx("Symbol table truncated");
230194579Sdelphij			*unpackd->symbol_eob++ = (char)thisbyte;
231194579Sdelphij		}
232194579Sdelphij		accepted_bytes(bytes_in, unpackd->symbolsin[i]);
233194579Sdelphij	}
234194579Sdelphij
235194579Sdelphij	/* Now, take account for the EOB symbol as well */
236194579Sdelphij	unpackd->symbolsin[unpackd->treelevels]++;
237194579Sdelphij
238194579Sdelphij	/*
239194579Sdelphij	 * The symbolsin table has been constructed now.
240213927Sbcr	 * Calculate the internal nodes count table based on it.
241194579Sdelphij	 */
242194579Sdelphij	unpackd_fill_inodesin(unpackd, 0);
243194579Sdelphij}
244194579Sdelphij
245194579Sdelphij/*
246194579Sdelphij * Decode huffman stream, based on the huffman tree.
247194579Sdelphij */
248194579Sdelphijstatic void
249194579Sdelphijunpack_decode(const unpack_descriptor_t *unpackd, off_t *bytes_in)
250194579Sdelphij{
251194579Sdelphij	int thislevel, thiscode, thisbyte, inlevelindex;
252194579Sdelphij	int i;
253194579Sdelphij	off_t bytes_out = 0;
254194579Sdelphij	const char *thissymbol;	/* The symbol pointer decoded from stream */
255194579Sdelphij
256194579Sdelphij	/*
257194579Sdelphij	 * Decode huffman.  Fetch every bytes from the file, get it
258194579Sdelphij	 * into 'thiscode' bit-by-bit, then output the symbol we got
259194579Sdelphij	 * when one has been found.
260194579Sdelphij	 *
261194579Sdelphij	 * Assumption: sizeof(int) > ((max tree levels + 1) / 8).
262194579Sdelphij	 * bad things could happen if not.
263194579Sdelphij	 */
264194579Sdelphij	thislevel = 0;
265194579Sdelphij	thiscode = thisbyte = 0;
266194579Sdelphij
267194579Sdelphij	while ((thisbyte = fgetc(unpackd->fpIn)) != EOF) {
268194579Sdelphij		accepted_bytes(bytes_in, 1);
269194579Sdelphij
270194579Sdelphij		/*
271194579Sdelphij		 * Split one bit from thisbyte, from highest to lowest,
272194579Sdelphij		 * feed the bit into thiscode, until we got a symbol from
273194579Sdelphij		 * the tree.
274194579Sdelphij		 */
275194579Sdelphij		for (i = 7; i >= 0; i--) {
276194579Sdelphij			thiscode = (thiscode << 1) | ((thisbyte >> i) & 1);
277194579Sdelphij
278194579Sdelphij			/* Did we got a symbol? (referencing leaf node) */
279194579Sdelphij			if (thiscode >= unpackd->inodesin[thislevel]) {
280194579Sdelphij				inlevelindex =
281194579Sdelphij				    thiscode - unpackd->inodesin[thislevel];
282194579Sdelphij				if (inlevelindex > unpackd->symbolsin[thislevel])
283194579Sdelphij					maybe_errx("File corrupt");
284194579Sdelphij
285194579Sdelphij				thissymbol =
286194579Sdelphij				    &(unpackd->tree[thislevel][inlevelindex]);
287194579Sdelphij				if ((thissymbol == unpackd->symbol_eob) &&
288194579Sdelphij				    (bytes_out == unpackd->uncompressed_size))
289194579Sdelphij					goto finished;
290194579Sdelphij
291194579Sdelphij				fputc((*thissymbol), unpackd->fpOut);
292194579Sdelphij				bytes_out++;
293194579Sdelphij
294194579Sdelphij				/* Prepare for next input */
295194579Sdelphij				thislevel = 0; thiscode = 0;
296194579Sdelphij			} else {
297194579Sdelphij				thislevel++;
298194579Sdelphij				if (thislevel > unpackd->treelevels)
299194579Sdelphij					maybe_errx("File corrupt");
300194579Sdelphij			}
301194579Sdelphij		}
302194579Sdelphij	}
303194579Sdelphij
304194579Sdelphijfinished:
305194579Sdelphij	if (bytes_out != unpackd->uncompressed_size)
306194579Sdelphij		maybe_errx("Premature EOF");
307194579Sdelphij}
308194579Sdelphij
309194579Sdelphij/* Handler for pack(1)'ed file */
310194579Sdelphijstatic off_t
311194579Sdelphijunpack(int in, int out, char *pre, size_t prelen, off_t *bytes_in)
312194579Sdelphij{
313194579Sdelphij	unpack_descriptor_t	unpackd;
314194579Sdelphij
315211475Sdelphij	in = dup(in);
316211475Sdelphij	if (in == -1)
317211475Sdelphij		maybe_err("dup");
318211475Sdelphij	out = dup(out);
319211475Sdelphij	if (out == -1)
320211475Sdelphij		maybe_err("dup");
321211475Sdelphij
322211475Sdelphij	unpack_parse_header(in, out, pre, prelen, bytes_in, &unpackd);
323194579Sdelphij	unpack_decode(&unpackd, bytes_in);
324194579Sdelphij	unpack_descriptor_fini(&unpackd);
325194579Sdelphij
326194579Sdelphij	/* If we reached here, the unpack was successful */
327194579Sdelphij	return (unpackd.uncompressed_size);
328194579Sdelphij}
329194579Sdelphij
330