1/*	$NetBSD: unxz.c,v 1.8 2018/10/06 16:36:45 martin Exp $	*/
2
3/*-
4 * SPDX-License-Identifier: BSD-2-Clause
5 *
6 * Copyright (c) 2011 The NetBSD Foundation, Inc.
7 * All rights reserved.
8 *
9 * This code is derived from software contributed to The NetBSD Foundation
10 * by Christos Zoulas.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 *    notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 *    notice, this list of conditions and the following disclaimer in the
19 *    documentation and/or other materials provided with the distribution.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
25 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 */
33#include <sys/cdefs.h>
34#include <stdarg.h>
35#include <errno.h>
36#include <stdio.h>
37#include <unistd.h>
38#include <lzma.h>
39
40static off_t
41unxz(int i, int o, char *pre, size_t prelen, off_t *bytes_in)
42{
43	lzma_stream strm = LZMA_STREAM_INIT;
44	static const int flags = LZMA_TELL_UNSUPPORTED_CHECK|LZMA_CONCATENATED;
45	lzma_ret ret;
46	lzma_action action = LZMA_RUN;
47	off_t bytes_out, bp;
48	uint8_t ibuf[BUFSIZ];
49	uint8_t obuf[BUFSIZ];
50
51	if (bytes_in == NULL)
52		bytes_in = &bp;
53
54	strm.next_in = ibuf;
55	memcpy(ibuf, pre, prelen);
56	strm.avail_in = read(i, ibuf + prelen, sizeof(ibuf) - prelen);
57	if (strm.avail_in == (size_t)-1)
58		maybe_err("read failed");
59	infile_newdata(strm.avail_in);
60	strm.avail_in += prelen;
61	*bytes_in = strm.avail_in;
62
63	if ((ret = lzma_stream_decoder(&strm, UINT64_MAX, flags)) != LZMA_OK)
64		maybe_errx("Can't initialize decoder (%d)", ret);
65
66	strm.next_out = NULL;
67	strm.avail_out = 0;
68	if ((ret = lzma_code(&strm, LZMA_RUN)) != LZMA_OK)
69		maybe_errx("Can't read headers (%d)", ret);
70
71	bytes_out = 0;
72	strm.next_out = obuf;
73	strm.avail_out = sizeof(obuf);
74
75	for (;;) {
76		check_siginfo();
77		if (strm.avail_in == 0) {
78			strm.next_in = ibuf;
79			strm.avail_in = read(i, ibuf, sizeof(ibuf));
80			switch (strm.avail_in) {
81			case (size_t)-1:
82				maybe_err("read failed");
83				/*NOTREACHED*/
84			case 0:
85				action = LZMA_FINISH;
86				break;
87			default:
88				infile_newdata(strm.avail_in);
89				*bytes_in += strm.avail_in;
90				break;
91			}
92		}
93
94		ret = lzma_code(&strm, action);
95
96		// Write and check write error before checking decoder error.
97		// This way as much data as possible gets written to output
98		// even if decoder detected an error.
99		if (strm.avail_out == 0 || ret != LZMA_OK) {
100			const size_t write_size = sizeof(obuf) - strm.avail_out;
101
102			if (write(o, obuf, write_size) != (ssize_t)write_size)
103				maybe_err("write failed");
104
105			strm.next_out = obuf;
106			strm.avail_out = sizeof(obuf);
107			bytes_out += write_size;
108		}
109
110		if (ret != LZMA_OK) {
111			if (ret == LZMA_STREAM_END) {
112				// Check that there's no trailing garbage.
113				if (strm.avail_in != 0 || read(i, ibuf, 1))
114					ret = LZMA_DATA_ERROR;
115				else {
116					lzma_end(&strm);
117					return bytes_out;
118				}
119			}
120
121			const char *msg;
122			switch (ret) {
123			case LZMA_MEM_ERROR:
124				msg = strerror(ENOMEM);
125				break;
126
127			case LZMA_FORMAT_ERROR:
128				msg = "File format not recognized";
129				break;
130
131			case LZMA_OPTIONS_ERROR:
132				// FIXME: Better message?
133				msg = "Unsupported compression options";
134				break;
135
136			case LZMA_DATA_ERROR:
137				msg = "File is corrupt";
138				break;
139
140			case LZMA_BUF_ERROR:
141				msg = "Unexpected end of input";
142				break;
143
144			case LZMA_MEMLIMIT_ERROR:
145				msg = "Reached memory limit";
146				break;
147
148			default:
149				maybe_errx("Unknown error (%d)", ret);
150				break;
151			}
152			maybe_errx("%s", msg);
153
154		}
155	}
156}
157
158#include <stdbool.h>
159
160/*
161 * Copied various bits and pieces from xz support code or brute force
162 * replacements.
163 */
164
/* Smaller of A and B. Function-like macro: an argument may be evaluated twice. */
#define	my_min(A,B)	((B)>(A)?(A):(B))
166
// BUFSIZ is too small to be a useful I/O unit on some systems, so use
// 8 KiB there. Otherwise round BUFSIZ down to a multiple of 8
// (sizeof(uint64_t)) so that the u64 view below spans the whole buffer.
#if BUFSIZ > 1024
#       define IO_BUFFER_SIZE (BUFSIZ & ~7U)
#else
#       define IO_BUFFER_SIZE 8192
#endif

/// I/O buffer that can be viewed as bytes, 32-bit words or 64-bit words.
/// Putting the views in a union makes sure that the byte array is properly
/// aligned for the word-sized accesses.
typedef union {
        uint8_t u8[IO_BUFFER_SIZE];
        uint32_t u32[IO_BUFFER_SIZE / sizeof(uint32_t)];
        uint64_t u64[IO_BUFFER_SIZE / sizeof(uint64_t)];
} io_buf;
182
183
184static bool
185io_pread(int fd, io_buf *buf, size_t size, off_t pos)
186{
187	// Using lseek() and read() is more portable than pread() and
188	// for us it is as good as real pread().
189	if (lseek(fd, pos, SEEK_SET) != pos) {
190		return true;
191	}
192
193	const size_t amount = read(fd, buf, size);
194	if (amount == SIZE_MAX)
195		return true;
196
197	if (amount != size) {
198		return true;
199	}
200
201	return false;
202}
203
204/*
205 * Most of the following is copied (mostly verbatim) from the xz
206 * distribution, from file src/xz/list.c
207 */
208
209///////////////////////////////////////////////////////////////////////////////
210//
211/// \file       list.c
212/// \brief      Listing information about .xz files
213//
214//  Author:     Lasse Collin
215//
216//  This file has been put into the public domain.
217//  You can do whatever you want with this file.
218//
219///////////////////////////////////////////////////////////////////////////////
220
221
222/// Information about a .xz file
223typedef struct {
224	/// Combined Index of all Streams in the file
225	lzma_index *idx;
226
227	/// Total amount of Stream Padding
228	uint64_t stream_padding;
229
230	/// Highest memory usage so far
231	uint64_t memusage_max;
232
233	/// True if all Blocks so far have Compressed Size and
234	/// Uncompressed Size fields
235	bool all_have_sizes;
236
237	/// Oldest XZ Utils version that will decompress the file
238	uint32_t min_version;
239
240} xz_file_info;
241
242#define XZ_FILE_INFO_INIT { NULL, 0, 0, true, 50000002 }
243
244
/// \brief      Parse the Index(es) from the given .xz file
///
/// The file is walked backwards from its end, one Stream per iteration of
/// the outer loop: Stream Padding is skipped, then the Stream Footer, the
/// Index and the Stream Header are decoded and checked against each other.
/// The Indexes of all Streams are concatenated in file order.
///
/// \param      xfi     Pointer to structure where the decoded information
///                     is stored. On success xfi->idx is set to the
///                     combined Index; the caller frees it with
///                     lzma_index_end(). xfi->stream_padding is increased
///                     for every Stream that was parsed completely, even
///                     if a later Stream makes the function fail.
/// \param      src_fd  Input file descriptor. The file size is taken from
///                     fstat() and the file offset is moved by io_pread().
///
/// \return     On success, false is returned. On error, true is returned.
///
// TODO: This function is pretty big. liblzma should have a function that
// takes a callback function to parse the Index(es) from a .xz file to make
// it easy for applications.
static bool
parse_indexes(xz_file_info *xfi, int src_fd)
{
	struct stat st;

	if (fstat(src_fd, &st) != 0) {
		return true;
	}

	// A valid file holds at least a Stream Header and a Stream Footer.
	if (st.st_size < 2 * LZMA_STREAM_HEADER_SIZE) {
		return true;
	}

	io_buf buf;
	lzma_stream_flags header_flags;
	lzma_stream_flags footer_flags;
	lzma_ret ret;

	// lzma_stream for the Index decoder
	lzma_stream strm = LZMA_STREAM_INIT;

	// All Indexes decoded so far
	lzma_index *combined_index = NULL;

	// The Index currently being decoded
	lzma_index *this_index = NULL;

	// Current position in the file. We parse the file backwards so
	// initialize it to point to the end of the file.
	off_t pos = st.st_size;

	// Each loop iteration decodes one Index.
	do {
		// Check that there is enough data left to contain at least
		// the Stream Header and Stream Footer. This check cannot
		// fail in the first pass of this loop.
		if (pos < 2 * LZMA_STREAM_HEADER_SIZE) {
			goto error;
		}

		pos -= LZMA_STREAM_HEADER_SIZE;
		lzma_vli stream_padding = 0;

		// Locate the Stream Footer. There may be Stream Padding which
		// we must skip when reading backwards.
		while (true) {
			if (pos < LZMA_STREAM_HEADER_SIZE) {
				goto error;
			}

			if (io_pread(src_fd, &buf,
					LZMA_STREAM_HEADER_SIZE, pos))
				goto error;

			// Stream Padding is always a multiple of four bytes.
			// Index 2 is the last 32-bit word of the 12 bytes
			// just read; if it is non-zero, buf holds the
			// Stream Footer.
			int i = 2;
			if (buf.u32[i] != 0)
				break;

			// To avoid calling io_pread() for every four bytes
			// of Stream Padding, take advantage that we read
			// 12 bytes (LZMA_STREAM_HEADER_SIZE) already and
			// check them too before calling io_pread() again.
			do {
				stream_padding += 4;
				pos -= 4;
				--i;
			} while (i >= 0 && buf.u32[i] == 0);
		}

		// Decode the Stream Footer.
		ret = lzma_stream_footer_decode(&footer_flags, buf.u8);
		if (ret != LZMA_OK) {
			goto error;
		}

		// Check that the Stream Footer doesn't specify something
		// that we don't support. This can only happen if the xz
		// version is older than liblzma and liblzma supports
		// something new.
		//
		// It is enough to check Stream Footer. Stream Header must
		// match when it is compared against Stream Footer with
		// lzma_stream_flags_compare().
		if (footer_flags.version != 0) {
			goto error;
		}

		// Check that the size of the Index field looks sane: the
		// Index and a Stream Header must fit before the Footer.
		lzma_vli index_size = footer_flags.backward_size;
		if ((lzma_vli)(pos) < index_size + LZMA_STREAM_HEADER_SIZE) {
			goto error;
		}

		// Set pos to the beginning of the Index.
		pos -= index_size;

		// Decode the Index.
		ret = lzma_index_decoder(&strm, &this_index, UINT64_MAX);
		if (ret != LZMA_OK) {
			goto error;
		}

		do {
			// Don't give the decoder more input than the
			// Index size.
			strm.avail_in = my_min(IO_BUFFER_SIZE, index_size);
			if (io_pread(src_fd, &buf, strm.avail_in, pos))
				goto error;

			// Advance past the chunk that was just read; pos
			// ends up at the Stream Footer again once the whole
			// Index has been fed to the decoder.
			pos += strm.avail_in;
			index_size -= strm.avail_in;

			strm.next_in = buf.u8;
			ret = lzma_code(&strm, LZMA_RUN);

		} while (ret == LZMA_OK);

		// If the decoding seems to be successful, check also that
		// the Index decoder consumed as much input as indicated
		// by the Backward Size field.
		if (ret == LZMA_STREAM_END)
			if (index_size != 0 || strm.avail_in != 0)
				ret = LZMA_DATA_ERROR;

		if (ret != LZMA_STREAM_END) {
			// LZMA_BUF_ERROR means that the Index decoder
			// would have liked more input than what the Index
			// size should be according to Stream Footer.
			// The message for LZMA_DATA_ERROR makes more
			// sense in that case.
			if (ret == LZMA_BUF_ERROR)
				ret = LZMA_DATA_ERROR;

			goto error;
		}

		// Decode the Stream Header and check that its Stream Flags
		// match the Stream Footer. First step back over the Index
		// and over the size of one Stream Header, then over the
		// Blocks, whose total size is known from the decoded Index.
		pos -= footer_flags.backward_size + LZMA_STREAM_HEADER_SIZE;
		if ((lzma_vli)(pos) < lzma_index_total_size(this_index)) {
			goto error;
		}

		pos -= lzma_index_total_size(this_index);
		if (io_pread(src_fd, &buf, LZMA_STREAM_HEADER_SIZE, pos))
			goto error;

		ret = lzma_stream_header_decode(&header_flags, buf.u8);
		if (ret != LZMA_OK) {
			goto error;
		}

		ret = lzma_stream_flags_compare(&header_flags, &footer_flags);
		if (ret != LZMA_OK) {
			goto error;
		}

		// Store the decoded Stream Flags into this_index. This is
		// needed so that we can print which Check is used in each
		// Stream.
		ret = lzma_index_stream_flags(this_index, &footer_flags);
		if (ret != LZMA_OK)
			goto error;

		// Store also the size of the Stream Padding field. It is
		// needed to show the offsets of the Streams correctly.
		ret = lzma_index_stream_padding(this_index, stream_padding);
		if (ret != LZMA_OK)
			goto error;

		if (combined_index != NULL) {
			// Append the earlier decoded Indexes
			// after this_index. The file is parsed backwards,
			// so they belong to Streams that come later in
			// the file.
			ret = lzma_index_cat(
					this_index, combined_index, NULL);
			if (ret != LZMA_OK) {
				goto error;
			}
		}

		// this_index now holds everything decoded so far. Clear
		// this_index so that the error path cannot free the same
		// Index twice.
		combined_index = this_index;
		this_index = NULL;

		xfi->stream_padding += stream_padding;

	// pos is at the start of the Stream just parsed; anything before
	// it must be another Stream.
	} while (pos > 0);

	lzma_end(&strm);

	// All OK. Make combined_index available to the caller.
	xfi->idx = combined_index;
	return false;

error:
	// Something went wrong, free the allocated memory.
	lzma_end(&strm);
	lzma_index_end(combined_index, NULL);
	lzma_index_end(this_index, NULL);
	return true;
}
457
/***************** end of copy from list.c *************************/
459
460/*
461 * Small wrapper to extract total length of a file
462 */
463off_t
464unxz_len(int fd)
465{
466	xz_file_info xfi = XZ_FILE_INFO_INIT;
467	if (!parse_indexes(&xfi, fd)) {
468		off_t res = lzma_index_uncompressed_size(xfi.idx);
469		lzma_index_end(xfi.idx, NULL);
470		return res;
471	}
472	return 0;
473}
474
475