archive_write_set_compression_compress.c revision 228753
1133819Stjr/*-
2230132Suqs * Copyright (c) 2008 Joerg Sonnenberger
3133819Stjr * All rights reserved.
4133819Stjr *
5133819Stjr * Redistribution and use in source and binary forms, with or without
6133819Stjr * modification, are permitted provided that the following conditions
7133819Stjr * are met:
8133819Stjr * 1. Redistributions of source code must retain the above copyright
9133819Stjr *    notice, this list of conditions and the following disclaimer.
10133819Stjr * 2. Redistributions in binary form must reproduce the above copyright
11133819Stjr *    notice, this list of conditions and the following disclaimer in the
12133819Stjr *    documentation and/or other materials provided with the distribution.
13133819Stjr *
14133819Stjr * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
15133819Stjr * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16133819Stjr * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17133819Stjr * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
18133819Stjr * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19133819Stjr * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20133819Stjr * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21133819Stjr * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22133819Stjr * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23133819Stjr * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24133819Stjr */
25133819Stjr
26133819Stjr/*-
27133819Stjr * Copyright (c) 1985, 1986, 1992, 1993
28133819Stjr *	The Regents of the University of California.  All rights reserved.
29133819Stjr *
30133819Stjr * This code is derived from software contributed to Berkeley by
31133819Stjr * Diomidis Spinellis and James A. Woods, derived from original
32235063Snetchild * work by Spencer Thomas and Joseph Orost.
33235063Snetchild *
34235063Snetchild * Redistribution and use in source and binary forms, with or without
35133819Stjr * modification, are permitted provided that the following conditions
36235063Snetchild * are met:
37235063Snetchild * 1. Redistributions of source code must retain the above copyright
38133819Stjr *    notice, this list of conditions and the following disclaimer.
39133819Stjr * 2. Redistributions in binary form must reproduce the above copyright
40133819Stjr *    notice, this list of conditions and the following disclaimer in the
41133819Stjr *    documentation and/or other materials provided with the distribution.
42133819Stjr * 3. Neither the name of the University nor the names of its contributors
43235063Snetchild *    may be used to endorse or promote products derived from this software
44133819Stjr *    without specific prior written permission.
45133819Stjr *
46235063Snetchild * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
47235063Snetchild * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
48235063Snetchild * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
49133819Stjr * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
50133819Stjr * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
51133819Stjr * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
52133819Stjr * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
53133819Stjr * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
54133819Stjr * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
55133819Stjr * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
56133819Stjr * SUCH DAMAGE.
57133819Stjr */
58133819Stjr
59133819Stjr#include "archive_platform.h"
60133819Stjr
61133819Stjr__FBSDID("$FreeBSD: head/lib/libarchive/archive_write_set_compression_compress.c 201111 2009-12-28 03:33:05Z kientzle $");
62133819Stjr
63133819Stjr#ifdef HAVE_ERRNO_H
64133819Stjr#include <errno.h>
65133819Stjr#endif
66133819Stjr#ifdef HAVE_STDLIB_H
67133819Stjr#include <stdlib.h>
68133819Stjr#endif
69133819Stjr#ifdef HAVE_STRING_H
70133819Stjr#include <string.h>
71133819Stjr#endif
72159824Snetchild
73255675Srdivacky#include "archive.h"
74255675Srdivacky#include "archive_private.h"
75255675Srdivacky#include "archive_write_private.h"
76159824Snetchild
77159824Snetchild#define	HSIZE		69001	/* 95% occupancy */
78159824Snetchild#define	HSHIFT		8	/* 8 - trunc(log2(HSIZE / 65536)) */
79159824Snetchild#define	CHECK_GAP 10000		/* Ratio check interval. */
80159824Snetchild
81159824Snetchild#define	MAXCODE(bits)	((1 << (bits)) - 1)
82159824Snetchild
83159824Snetchild/*
84159824Snetchild * the next two codes should not be changed lightly, as they must not
85159824Snetchild * lie within the contiguous general code space.
86159824Snetchild */
87159824Snetchild#define	FIRST	257		/* First free entry. */
88159824Snetchild#define	CLEAR	256		/* Table clear output code. */
89232799Snetchild
90159824Snetchildstruct private_data {
91159824Snetchild	off_t in_count, out_count, checkpoint;
92159824Snetchild
93232799Snetchild	int code_len;			/* Number of bits/code. */
94159824Snetchild	int cur_maxcode;		/* Maximum code, given n_bits. */
95159824Snetchild	int max_maxcode;		/* Should NEVER generate this code. */
96159824Snetchild	int hashtab [HSIZE];
97159824Snetchild	unsigned short codetab [HSIZE];
98159824Snetchild	int first_free;		/* First unused entry. */
99232799Snetchild	int compress_ratio;
100159824Snetchild
101159824Snetchild	int cur_code, cur_fcode;
102159824Snetchild
103159824Snetchild	int bit_offset;
104232799Snetchild	unsigned char bit_buf;
105178257Sjkim
106178257Sjkim	unsigned char	*compressed;
107178257Sjkim	size_t		 compressed_buffer_size;
108178257Sjkim	size_t		 compressed_offset;
109232799Snetchild};
110232799Snetchild
111232799Snetchildstatic int	archive_compressor_compress_finish(struct archive_write *);
112232799Snetchildstatic int	archive_compressor_compress_init(struct archive_write *);
113232799Snetchildstatic int	archive_compressor_compress_write(struct archive_write *,
114232799Snetchild		    const void *, size_t);
115232799Snetchild
116232799Snetchild/*
117232799Snetchild * Allocate, initialize and return a archive object.
118232799Snetchild */
119232799Snetchildint
120232799Snetchildarchive_write_set_compression_compress(struct archive *_a)
121232799Snetchild{
122232799Snetchild	struct archive_write *a = (struct archive_write *)_a;
123232799Snetchild	__archive_check_magic(&a->archive, ARCHIVE_WRITE_MAGIC,
124232799Snetchild	    ARCHIVE_STATE_NEW, "archive_write_set_compression_compress");
125232799Snetchild	a->compressor.init = &archive_compressor_compress_init;
126232799Snetchild	a->archive.compression_code = ARCHIVE_COMPRESSION_COMPRESS;
127255675Srdivacky	a->archive.compression_name = "compress";
128232799Snetchild	return (ARCHIVE_OK);
129232799Snetchild}
130232799Snetchild
131232799Snetchild/*
132232799Snetchild * Setup callback.
133232799Snetchild */
134232799Snetchildstatic int
135232799Snetchildarchive_compressor_compress_init(struct archive_write *a)
136232799Snetchild{
137232799Snetchild	int ret;
138232799Snetchild	struct private_data *state;
139232799Snetchild
140232799Snetchild	a->archive.compression_code = ARCHIVE_COMPRESSION_COMPRESS;
141232799Snetchild	a->archive.compression_name = "compress";
142232799Snetchild
143232799Snetchild	if (a->bytes_per_block < 4) {
144232799Snetchild		archive_set_error(&a->archive, EINVAL,
145232799Snetchild		    "Can't write Compress header as single block");
146232799Snetchild		return (ARCHIVE_FATAL);
147232799Snetchild	}
148232799Snetchild
149232799Snetchild	if (a->client_opener != NULL) {
150232799Snetchild		ret = (a->client_opener)(&a->archive, a->client_data);
151133819Stjr		if (ret != ARCHIVE_OK)
152133819Stjr			return (ret);
153133819Stjr	}
154133819Stjr
155133819Stjr	state = (struct private_data *)malloc(sizeof(*state));
156133819Stjr	if (state == NULL) {
157133819Stjr		archive_set_error(&a->archive, ENOMEM,
158133819Stjr		    "Can't allocate data for compression");
159133819Stjr		return (ARCHIVE_FATAL);
160133819Stjr	}
161133819Stjr	memset(state, 0, sizeof(*state));
162133819Stjr
163133819Stjr	state->compressed_buffer_size = a->bytes_per_block;
164133819Stjr	state->compressed = malloc(state->compressed_buffer_size);
165133819Stjr
166133819Stjr	if (state->compressed == NULL) {
167133819Stjr		archive_set_error(&a->archive, ENOMEM,
168133819Stjr		    "Can't allocate data for compression buffer");
169133819Stjr		free(state);
170133819Stjr		return (ARCHIVE_FATAL);
171133819Stjr	}
172
173	a->compressor.write = archive_compressor_compress_write;
174	a->compressor.finish = archive_compressor_compress_finish;
175
176	state->max_maxcode = 0x10000;	/* Should NEVER generate this code. */
177	state->in_count = 0;		/* Length of input. */
178	state->bit_buf = 0;
179	state->bit_offset = 0;
180	state->out_count = 3;		/* Includes 3-byte header mojo. */
181	state->compress_ratio = 0;
182	state->checkpoint = CHECK_GAP;
183	state->code_len = 9;
184	state->cur_maxcode = MAXCODE(state->code_len);
185	state->first_free = FIRST;
186
187	memset(state->hashtab, 0xff, sizeof(state->hashtab));
188
189	/* Prime output buffer with a gzip header. */
190	state->compressed[0] = 0x1f; /* Compress */
191	state->compressed[1] = 0x9d;
192	state->compressed[2] = 0x90; /* Block mode, 16bit max */
193	state->compressed_offset = 3;
194
195	a->compressor.data = state;
196	return (0);
197}
198
199/*-
200 * Output the given code.
201 * Inputs:
202 * 	code:	A n_bits-bit integer.  If == -1, then EOF.  This assumes
203 *		that n_bits =< (long)wordsize - 1.
204 * Outputs:
205 * 	Outputs code to the file.
206 * Assumptions:
207 *	Chars are 8 bits long.
208 * Algorithm:
209 * 	Maintain a BITS character long buffer (so that 8 codes will
210 * fit in it exactly).  Use the VAX insv instruction to insert each
211 * code in turn.  When the buffer fills up empty it and start over.
212 */
213
214static unsigned char rmask[9] =
215	{0x00, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, 0xff};
216
217static int
218output_byte(struct archive_write *a, unsigned char c)
219{
220	struct private_data *state = a->compressor.data;
221	ssize_t bytes_written;
222
223	state->compressed[state->compressed_offset++] = c;
224	++state->out_count;
225
226	if (state->compressed_buffer_size == state->compressed_offset) {
227		bytes_written = (a->client_writer)(&a->archive,
228		    a->client_data,
229		    state->compressed, state->compressed_buffer_size);
230		if (bytes_written <= 0)
231			return ARCHIVE_FATAL;
232		a->archive.raw_position += bytes_written;
233		state->compressed_offset = 0;
234	}
235
236	return ARCHIVE_OK;
237}
238
239static int
240output_code(struct archive_write *a, int ocode)
241{
242	struct private_data *state = a->compressor.data;
243	int bits, ret, clear_flg, bit_offset;
244
245	clear_flg = ocode == CLEAR;
246
247	/*
248	 * Since ocode is always >= 8 bits, only need to mask the first
249	 * hunk on the left.
250	 */
251	bit_offset = state->bit_offset % 8;
252	state->bit_buf |= (ocode << bit_offset) & 0xff;
253	output_byte(a, state->bit_buf);
254
255	bits = state->code_len - (8 - bit_offset);
256	ocode >>= 8 - bit_offset;
257	/* Get any 8 bit parts in the middle (<=1 for up to 16 bits). */
258	if (bits >= 8) {
259		output_byte(a, ocode & 0xff);
260		ocode >>= 8;
261		bits -= 8;
262	}
263	/* Last bits. */
264	state->bit_offset += state->code_len;
265	state->bit_buf = ocode & rmask[bits];
266	if (state->bit_offset == state->code_len * 8)
267		state->bit_offset = 0;
268
269	/*
270	 * If the next entry is going to be too big for the ocode size,
271	 * then increase it, if possible.
272	 */
273	if (clear_flg || state->first_free > state->cur_maxcode) {
274	       /*
275		* Write the whole buffer, because the input side won't
276		* discover the size increase until after it has read it.
277		*/
278		if (state->bit_offset > 0) {
279			while (state->bit_offset < state->code_len * 8) {
280				ret = output_byte(a, state->bit_buf);
281				if (ret != ARCHIVE_OK)
282					return ret;
283				state->bit_offset += 8;
284				state->bit_buf = 0;
285			}
286		}
287		state->bit_buf = 0;
288		state->bit_offset = 0;
289
290		if (clear_flg) {
291			state->code_len = 9;
292			state->cur_maxcode = MAXCODE(state->code_len);
293		} else {
294			state->code_len++;
295			if (state->code_len == 16)
296				state->cur_maxcode = state->max_maxcode;
297			else
298				state->cur_maxcode = MAXCODE(state->code_len);
299		}
300	}
301
302	return (ARCHIVE_OK);
303}
304
305static int
306output_flush(struct archive_write *a)
307{
308	struct private_data *state = a->compressor.data;
309	int ret;
310
311	/* At EOF, write the rest of the buffer. */
312	if (state->bit_offset % 8) {
313		state->code_len = (state->bit_offset % 8 + 7) / 8;
314		ret = output_byte(a, state->bit_buf);
315		if (ret != ARCHIVE_OK)
316			return ret;
317	}
318
319	return (ARCHIVE_OK);
320}
321
322/*
323 * Write data to the compressed stream.
324 */
325static int
326archive_compressor_compress_write(struct archive_write *a, const void *buff,
327    size_t length)
328{
329	struct private_data *state;
330	int i;
331	int ratio;
332	int c, disp, ret;
333	const unsigned char *bp;
334
335	state = (struct private_data *)a->compressor.data;
336	if (a->client_writer == NULL) {
337		archive_set_error(&a->archive, ARCHIVE_ERRNO_PROGRAMMER,
338		    "No write callback is registered?  "
339		    "This is probably an internal programming error.");
340		return (ARCHIVE_FATAL);
341	}
342
343	if (length == 0)
344		return ARCHIVE_OK;
345
346	bp = buff;
347
348	if (state->in_count == 0) {
349		state->cur_code = *bp++;
350		++state->in_count;
351		--length;
352	}
353
354	while (length--) {
355		c = *bp++;
356		state->in_count++;
357		state->cur_fcode = (c << 16) + state->cur_code;
358		i = ((c << HSHIFT) ^ state->cur_code);	/* Xor hashing. */
359
360		if (state->hashtab[i] == state->cur_fcode) {
361			state->cur_code = state->codetab[i];
362			continue;
363		}
364		if (state->hashtab[i] < 0)	/* Empty slot. */
365			goto nomatch;
366		/* Secondary hash (after G. Knott). */
367		if (i == 0)
368			disp = 1;
369		else
370			disp = HSIZE - i;
371 probe:
372		if ((i -= disp) < 0)
373			i += HSIZE;
374
375		if (state->hashtab[i] == state->cur_fcode) {
376			state->cur_code = state->codetab[i];
377			continue;
378		}
379		if (state->hashtab[i] >= 0)
380			goto probe;
381 nomatch:
382		ret = output_code(a, state->cur_code);
383		if (ret != ARCHIVE_OK)
384			return ret;
385		state->cur_code = c;
386		if (state->first_free < state->max_maxcode) {
387			state->codetab[i] = state->first_free++;	/* code -> hashtable */
388			state->hashtab[i] = state->cur_fcode;
389			continue;
390		}
391		if (state->in_count < state->checkpoint)
392			continue;
393
394		state->checkpoint = state->in_count + CHECK_GAP;
395
396		if (state->in_count <= 0x007fffff)
397			ratio = state->in_count * 256 / state->out_count;
398		else if ((ratio = state->out_count / 256) == 0)
399			ratio = 0x7fffffff;
400		else
401			ratio = state->in_count / ratio;
402
403		if (ratio > state->compress_ratio)
404			state->compress_ratio = ratio;
405		else {
406			state->compress_ratio = 0;
407			memset(state->hashtab, 0xff, sizeof(state->hashtab));
408			state->first_free = FIRST;
409			ret = output_code(a, CLEAR);
410			if (ret != ARCHIVE_OK)
411				return ret;
412		}
413	}
414
415	return (ARCHIVE_OK);
416}
417
418
419/*
420 * Finish the compression...
421 */
422static int
423archive_compressor_compress_finish(struct archive_write *a)
424{
425	ssize_t block_length, target_block_length, bytes_written;
426	int ret;
427	struct private_data *state;
428	size_t tocopy;
429
430	state = (struct private_data *)a->compressor.data;
431	if (a->client_writer == NULL) {
432		archive_set_error(&a->archive, ARCHIVE_ERRNO_PROGRAMMER,
433		    "No write callback is registered?  "
434		    "This is probably an internal programming error.");
435		ret = ARCHIVE_FATAL;
436		goto cleanup;
437	}
438
439	/* By default, always pad the uncompressed data. */
440	if (a->pad_uncompressed) {
441		while (state->in_count % a->bytes_per_block != 0) {
442			tocopy = a->bytes_per_block -
443			    (state->in_count % a->bytes_per_block);
444			if (tocopy > a->null_length)
445				tocopy = a->null_length;
446			ret = archive_compressor_compress_write(a, a->nulls,
447			    tocopy);
448			if (ret != ARCHIVE_OK)
449				goto cleanup;
450		}
451	}
452
453	ret = output_code(a, state->cur_code);
454	if (ret != ARCHIVE_OK)
455		goto cleanup;
456	ret = output_flush(a);
457	if (ret != ARCHIVE_OK)
458		goto cleanup;
459
460	/* Optionally, pad the final compressed block. */
461	block_length = state->compressed_offset;
462
463	/* Tricky calculation to determine size of last block. */
464	if (a->bytes_in_last_block <= 0)
465		/* Default or Zero: pad to full block */
466		target_block_length = a->bytes_per_block;
467	else
468		/* Round length to next multiple of bytes_in_last_block. */
469		target_block_length = a->bytes_in_last_block *
470		    ( (block_length + a->bytes_in_last_block - 1) /
471			a->bytes_in_last_block);
472	if (target_block_length > a->bytes_per_block)
473		target_block_length = a->bytes_per_block;
474	if (block_length < target_block_length) {
475		memset(state->compressed + state->compressed_offset, 0,
476		    target_block_length - block_length);
477		block_length = target_block_length;
478	}
479
480	/* Write the last block */
481	bytes_written = (a->client_writer)(&a->archive, a->client_data,
482	    state->compressed, block_length);
483	if (bytes_written <= 0)
484		ret = ARCHIVE_FATAL;
485	else
486		a->archive.raw_position += bytes_written;
487
488cleanup:
489	free(state->compressed);
490	free(state);
491	return (ret);
492}
493