1/*	$NetBSD: unlz.c,v 1.6 2018/11/11 01:42:36 christos Exp $	*/
2
3/*-
4 * Copyright (c) 2018 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Christos Zoulas.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 *
31 * $FreeBSD$
32 */
33
34/*  Lzd - Educational decompressor for the lzip format
35    Copyright (C) 2013-2018 Antonio Diaz Diaz.
36
37    This program is free software. Redistribution and use in source and
38    binary forms, with or without modification, are permitted provided
39    that the following conditions are met:
40
41    1. Redistributions of source code must retain the above copyright
42    notice, this list of conditions and the following disclaimer.
43
44    2. Redistributions in binary form must reproduce the above copyright
45    notice, this list of conditions and the following disclaimer in the
46    documentation and/or other materials provided with the distribution.
47
48    This program is distributed in the hope that it will be useful,
49    but WITHOUT ANY WARRANTY; without even the implied warranty of
50    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
51*/
52
53#include <sys/param.h>
54#include <stdio.h>
55#include <string.h>
56#include <stdlib.h>
57#include <stdint.h>
58#include <stdbool.h>
59#include <errno.h>
60#include <unistd.h>
61
62#define LZ_STATES		12
63
64#define LITERAL_CONTEXT_BITS	3
65#define POS_STATE_BITS		2
66#define POS_STATES		(1 << POS_STATE_BITS)
67#define POS_STATE_MASK 		(POS_STATES - 1)
68
69#define STATES			4
70#define DIS_SLOT_BITS		6
71
72#define DIS_MODEL_START		4
73#define DIS_MODEL_END		14
74
75#define MODELED_DISTANCES	(1 << (DIS_MODEL_END / 2))
76#define DIS_ALIGN_BITS		4
77#define DIS_ALIGN_SIZE		(1 << DIS_ALIGN_BITS)
78
79#define LOW_BITS		3
80#define MID_BITS		3
81#define HIGH_BITS		8
82
83#define LOW_SYMBOLS		(1 << LOW_BITS)
84#define MID_SYMBOLS		(1 << MID_BITS)
85#define HIGH_SYMBOLS		(1 << HIGH_BITS)
86
87#define MAX_SYMBOLS 		(LOW_SYMBOLS + MID_SYMBOLS + HIGH_SYMBOLS)
88
89#define MIN_MATCH_LEN		2
90
91#define BIT_MODEL_MOVE_BITS	5
92#define BIT_MODEL_TOTAL_BITS 	11
93#define BIT_MODEL_TOTAL 	(1 << BIT_MODEL_TOTAL_BITS)
94#define BIT_MODEL_INIT		(BIT_MODEL_TOTAL / 2)
95
96static const int lz_st_next[] = {
97	0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5,
98};
99
100static bool
101lz_st_is_char(int st) {
102	return st < 7;
103}
104
105static int
106lz_st_get_char(int st) {
107	return lz_st_next[st];
108}
109
110static int
111lz_st_get_match(int st) {
112	return st < 7 ? 7 : 10;
113}
114
115static int
116lz_st_get_rep(int st) {
117	return st < 7 ? 8 : 11;
118}
119
120static int
121lz_st_get_short_rep(int st) {
122	return st < 7 ? 9 : 11;
123}
124
125struct lz_len_model {
126	int choice1;
127	int choice2;
128	int bm_low[POS_STATES][LOW_SYMBOLS];
129	int bm_mid[POS_STATES][MID_SYMBOLS];
130	int bm_high[HIGH_SYMBOLS];
131};
132
133static uint32_t lz_crc[256];
134
135static void
136lz_crc_init(void)
137{
138	for (unsigned i = 0; i < nitems(lz_crc); i++) {
139		unsigned c = i;
140		for (unsigned j = 0; j < 8; j++) {
141			if (c & 1)
142				c = 0xEDB88320U ^ (c >> 1);
143			else
144				c >>= 1;
145		}
146		lz_crc[i] = c;
147      }
148}
149
150static void
151lz_crc_update(uint32_t *crc, const uint8_t *buf, size_t len)
152{
153	for (size_t i = 0; i < len; i++)
154		*crc = lz_crc[(*crc ^ buf[i]) & 0xFF] ^ (*crc >> 8);
155}
156
157struct lz_range_decoder {
158	FILE *fp;
159	uint32_t code;
160	uint32_t range;
161};
162
163static int
164lz_rd_create(struct lz_range_decoder *rd, FILE *fp)
165{
166	rd->fp = fp;
167	rd->code = 0;
168	rd->range = ~0;
169	for (int i = 0; i < 5; i++)
170		rd->code = (rd->code << 8) | (uint8_t)getc(rd->fp);
171	return ferror(rd->fp) ? -1 : 0;
172}
173
174static unsigned
175lz_rd_decode(struct lz_range_decoder *rd, int num_bits)
176{
177	unsigned symbol = 0;
178
179	for (int i = num_bits; i > 0; i--) {
180		rd->range >>= 1;
181		symbol <<= 1;
182		if (rd->code >= rd->range) {
183			rd->code -= rd->range;
184			symbol |= 1;
185		}
186		if (rd->range <= 0x00FFFFFFU) {
187			rd->range <<= 8;
188			rd->code = (rd->code << 8) | (uint8_t)getc(rd->fp);
189		}
190	}
191
192	return symbol;
193}
194
195static unsigned
196lz_rd_decode_bit(struct lz_range_decoder *rd, int *bm)
197{
198	unsigned symbol;
199	const uint32_t bound = (rd->range >> BIT_MODEL_TOTAL_BITS) * *bm;
200
201	if(rd->code < bound) {
202		rd->range = bound;
203		*bm += (BIT_MODEL_TOTAL - *bm) >> BIT_MODEL_MOVE_BITS;
204		symbol = 0;
205	}
206	else {
207		rd->range -= bound;
208		rd->code -= bound;
209		*bm -= *bm >> BIT_MODEL_MOVE_BITS;
210		symbol = 1;
211	}
212
213	if (rd->range <= 0x00FFFFFFU) {
214		rd->range <<= 8;
215		rd->code = (rd->code << 8) | (uint8_t)getc(rd->fp);
216	}
217	return symbol;
218}
219
220static unsigned
221lz_rd_decode_tree(struct lz_range_decoder *rd, int *bm, int num_bits)
222{
223	unsigned symbol = 1;
224
225	for (int i = 0; i < num_bits; i++)
226		symbol = (symbol << 1) | lz_rd_decode_bit(rd, &bm[symbol]);
227
228	return symbol - (1 << num_bits);
229}
230
231static unsigned
232lz_rd_decode_tree_reversed(struct lz_range_decoder *rd, int *bm, int num_bits)
233{
234	unsigned symbol = lz_rd_decode_tree(rd, bm, num_bits);
235	unsigned reversed_symbol = 0;
236
237	for (int i = 0; i < num_bits; i++) {
238		reversed_symbol = (reversed_symbol << 1) | (symbol & 1);
239		symbol >>= 1;
240	}
241
242	return reversed_symbol;
243}
244
245static unsigned
246lz_rd_decode_matched(struct lz_range_decoder *rd, int *bm, int match_byte)
247{
248	unsigned symbol = 1;
249
250	for (int i = 7; i >= 0; i--) {
251		const unsigned match_bit = (match_byte >> i) & 1;
252		const unsigned bit = lz_rd_decode_bit(rd,
253		    &bm[symbol + (match_bit << 8) + 0x100]);
254		symbol = (symbol << 1) | bit;
255		if (match_bit != bit) {
256			while (symbol < 0x100) {
257				symbol = (symbol << 1) |
258				    lz_rd_decode_bit(rd, &bm[symbol]);
259			}
260			break;
261		}
262	}
263	return symbol & 0xFF;
264}
265
266static unsigned
267lz_rd_decode_len(struct lz_range_decoder *rd, struct lz_len_model *lm,
268    int pos_state)
269{
270	if (lz_rd_decode_bit(rd, &lm->choice1) == 0)
271		return lz_rd_decode_tree(rd, lm->bm_low[pos_state], LOW_BITS);
272
273	if (lz_rd_decode_bit(rd, &lm->choice2) == 0) {
274		return LOW_SYMBOLS +
275		    lz_rd_decode_tree(rd, lm->bm_mid[pos_state], MID_BITS);
276	}
277
278	return LOW_SYMBOLS + MID_SYMBOLS +
279           lz_rd_decode_tree(rd, lm->bm_high, HIGH_BITS);
280}
281
282struct lz_decoder {
283	FILE *fin, *fout;
284	off_t pos, ppos, spos, dict_size;
285	bool wrapped;
286	uint32_t crc;
287	uint8_t *obuf;
288	struct lz_range_decoder rdec;
289};
290
291static int
292lz_flush(struct lz_decoder *lz)
293{
294	off_t offs = lz->pos - lz->spos;
295	if (offs <= 0)
296		return -1;
297
298	size_t size = (size_t)offs;
299	lz_crc_update(&lz->crc, lz->obuf + lz->spos, size);
300	if (fwrite(lz->obuf + lz->spos, 1, size, lz->fout) != size)
301		return -1;
302
303	lz->wrapped = lz->pos >= lz->dict_size;
304	if (lz->wrapped) {
305		lz->ppos += lz->pos;
306		lz->pos = 0;
307	}
308	lz->spos = lz->pos;
309	return 0;
310}
311
312static void
313lz_destroy(struct lz_decoder *lz)
314{
315	if (lz->fin)
316		fclose(lz->fin);
317	if (lz->fout)
318		fclose(lz->fout);
319	free(lz->obuf);
320}
321
322static int
323lz_create(struct lz_decoder *lz, int fin, int fdout, int dict_size)
324{
325	memset(lz, 0, sizeof(*lz));
326
327	lz->fin = fdopen(dup(fin), "r");
328	if (lz->fin == NULL)
329		goto out;
330
331	lz->fout = fdopen(dup(fdout), "w");
332	if (lz->fout == NULL)
333		goto out;
334
335	lz->pos = lz->ppos = lz->spos = 0;
336	lz->crc = ~0;
337	lz->dict_size = dict_size;
338	lz->wrapped = false;
339
340	lz->obuf = malloc(dict_size);
341	if (lz->obuf == NULL)
342		goto out;
343
344	if (lz_rd_create(&lz->rdec, lz->fin) == -1)
345		goto out;
346	return 0;
347out:
348	lz_destroy(lz);
349	return -1;
350}
351
352static uint8_t
353lz_peek(const struct lz_decoder *lz, unsigned ahead)
354{
355	off_t diff = lz->pos - ahead - 1;
356
357	if (diff >= 0)
358		return lz->obuf[diff];
359
360	if (lz->wrapped)
361		return lz->obuf[lz->dict_size + diff];
362
363	return 0;
364}
365
366static void
367lz_put(struct lz_decoder *lz, uint8_t b)
368{
369	lz->obuf[lz->pos++] = b;
370	if (lz->dict_size == lz->pos)
371		lz_flush(lz);
372}
373
374static off_t
375lz_get_data_position(const struct lz_decoder *lz)
376{
377	return lz->ppos + lz->pos;
378}
379
380static unsigned
381lz_get_crc(const struct lz_decoder *lz)
382{
383	return lz->crc ^ 0xffffffffU;
384}
385
386static void
387lz_bm_init(int *a, size_t l)
388{
389	for (size_t i = 0; i < l; i++)
390		a[i] = BIT_MODEL_INIT;
391}
392
393#define LZ_BM_INIT(a)	lz_bm_init(a, nitems(a))
394#define LZ_BM_INIT2(a)	do { \
395	size_t l = nitems(a[0]); \
396	for (size_t i = 0; i < nitems(a); i++) \
397		lz_bm_init(a[i], l); \
398} while (/*CONSTCOND*/0)
399
400#define LZ_MODEL_INIT(a) do { \
401	a.choice1 = BIT_MODEL_INIT; \
402	a.choice2 = BIT_MODEL_INIT; \
403	LZ_BM_INIT2(a.bm_low); \
404	LZ_BM_INIT2(a.bm_mid); \
405	LZ_BM_INIT(a.bm_high); \
406} while (/*CONSTCOND*/0)
407
408static bool
409lz_decode_member(struct lz_decoder *lz)
410{
411	int bm_literal[1 << LITERAL_CONTEXT_BITS][0x300];
412	int bm_match[LZ_STATES][POS_STATES];
413	int bm_rep[4][LZ_STATES];
414	int bm_len[LZ_STATES][POS_STATES];
415	int bm_dis_slot[LZ_STATES][1 << DIS_SLOT_BITS];
416	int bm_dis[MODELED_DISTANCES - DIS_MODEL_END + 1];
417	int bm_align[DIS_ALIGN_SIZE];
418
419	LZ_BM_INIT2(bm_literal);
420	LZ_BM_INIT2(bm_match);
421	LZ_BM_INIT2(bm_rep);
422	LZ_BM_INIT2(bm_len);
423	LZ_BM_INIT2(bm_dis_slot);
424	LZ_BM_INIT(bm_dis);
425	LZ_BM_INIT(bm_align);
426
427	struct lz_len_model match_len_model;
428	struct lz_len_model rep_len_model;
429
430	LZ_MODEL_INIT(match_len_model);
431	LZ_MODEL_INIT(rep_len_model);
432
433	struct lz_range_decoder *rd = &lz->rdec;
434	unsigned rep[4] = { 0 };
435
436
437	int state = 0;
438
439	while (!feof(lz->fin) && !ferror(lz->fin)) {
440		const int pos_state = lz_get_data_position(lz) & POS_STATE_MASK;
441		// bit 1
442		if (lz_rd_decode_bit(rd, &bm_match[state][pos_state]) == 0) {
443			const uint8_t prev_byte = lz_peek(lz, 0);
444			const int literal_state =
445			    prev_byte >> (8 - LITERAL_CONTEXT_BITS);
446			int *bm = bm_literal[literal_state];
447			if (lz_st_is_char(state))
448				lz_put(lz, lz_rd_decode_tree(rd, bm, 8));
449			else {
450				int peek = lz_peek(lz, rep[0]);
451				lz_put(lz, lz_rd_decode_matched(rd, bm, peek));
452			}
453			state = lz_st_get_char(state);
454			continue;
455		}
456		int len;
457		// bit 2
458		if (lz_rd_decode_bit(rd, &bm_rep[0][state]) != 0) {
459			// bit 3
460			if (lz_rd_decode_bit(rd, &bm_rep[1][state]) == 0) {
461				// bit 4
462				if (lz_rd_decode_bit(rd,
463				    &bm_len[state][pos_state]) == 0)
464				{
465					state = lz_st_get_short_rep(state);
466					lz_put(lz, lz_peek(lz, rep[0]));
467					continue;
468				}
469			} else {
470				unsigned distance;
471				// bit 4
472				if (lz_rd_decode_bit(rd, &bm_rep[2][state])
473				    == 0)
474					distance = rep[1];
475				else {
476					// bit 5
477					if (lz_rd_decode_bit(rd,
478					    &bm_rep[3][state]) == 0)
479						distance = rep[2];
480					else {
481						distance = rep[3];
482						rep[3] = rep[2];
483					}
484					rep[2] = rep[1];
485				}
486				rep[1] = rep[0];
487				rep[0] = distance;
488			}
489			state = lz_st_get_rep(state);
490			len = MIN_MATCH_LEN +
491			    lz_rd_decode_len(rd, &rep_len_model, pos_state);
492		} else {
493			rep[3] = rep[2]; rep[2] = rep[1]; rep[1] = rep[0];
494			len = MIN_MATCH_LEN +
495			    lz_rd_decode_len(rd, &match_len_model, pos_state);
496			const int len_state =
497			    MIN(len - MIN_MATCH_LEN, STATES - 1);
498			rep[0] = lz_rd_decode_tree(rd, bm_dis_slot[len_state],
499			    DIS_SLOT_BITS);
500			if (rep[0] >= DIS_MODEL_START) {
501				const unsigned dis_slot = rep[0];
502				const int direct_bits = (dis_slot >> 1) - 1;
503			        rep[0] = (2 | (dis_slot & 1)) << direct_bits;
504				if (dis_slot < DIS_MODEL_END)
505					rep[0] += lz_rd_decode_tree_reversed(rd,
506					    &bm_dis[rep[0] - dis_slot],
507                                            direct_bits);
508				else {
509					rep[0] += lz_rd_decode(rd, direct_bits
510					    - DIS_ALIGN_BITS) << DIS_ALIGN_BITS;
511					rep[0] += lz_rd_decode_tree_reversed(rd,
512					    bm_align, DIS_ALIGN_BITS);
513					if (rep[0] == 0xFFFFFFFFU) {
514						lz_flush(lz);
515						return len == MIN_MATCH_LEN;
516					}
517				}
518			}
519			state = lz_st_get_match(state);
520			if (rep[0] >= lz->dict_size ||
521			    (rep[0] >= lz->pos && !lz->wrapped)) {
522				lz_flush(lz);
523				return false;
524			}
525		}
526		for (int i = 0; i < len; i++)
527			lz_put(lz, lz_peek(lz, rep[0]));
528    	}
529	lz_flush(lz);
530	return false;
531}
532
533/*
534 * 0-3	CRC32 of the uncompressed data
535 * 4-11 size of the uncompressed data
536 * 12-19 member size including header and trailer
537 */
538#define TRAILER_SIZE 20
539
540
541static off_t
542lz_decode(int fin, int fdout, unsigned dict_size, off_t *insize)
543{
544	struct lz_decoder lz;
545	off_t rv = -1;
546
547	if (lz_create(&lz, fin, fdout, dict_size) == -1)
548		return -1;
549
550	if (!lz_decode_member(&lz))
551		goto out;
552
553	uint8_t trailer[TRAILER_SIZE];
554
555	for(size_t i = 0; i < nitems(trailer); i++)
556		trailer[i] = (uint8_t)getc(lz.fin);
557
558	unsigned crc = 0;
559	for (int i = 3; i >= 0; --i) {
560		crc <<= 8;
561		crc += trailer[i];
562	}
563
564	int64_t data_size = 0;
565	for (int i = 11; i >= 4; --i) {
566		data_size <<= 8;
567		data_size += trailer[i];
568	}
569
570	if (crc != lz_get_crc(&lz) || data_size != lz_get_data_position(&lz))
571		goto out;
572
573	rv = 0;
574	for (int i = 19; i >= 12; --i) {
575		rv <<= 8;
576		rv += trailer[i];
577	}
578	if (insize)
579		*insize = rv;
580#if 0
581	/* Does not work with pipes */
582	rv = ftello(lz.fout);
583#else
584	rv = data_size;
585#endif
586out:
587	lz_destroy(&lz);
588	return rv;
589}
590
591
592/*
593 * 0-3 magic
594 * 4 version
595 * 5 coded dict_size
596 */
597#define HDR_SIZE 6
598#define MIN_DICTIONARY_SIZE (1 << 12)
599#define MAX_DICTIONARY_SIZE (1 << 29)
600
601static const char hdrmagic[] = { 'L', 'Z', 'I', 'P', 1 };
602
603static unsigned
604lz_get_dict_size(unsigned char c)
605{
606	unsigned dict_size = 1 << (c & 0x1f);
607	dict_size -= (dict_size >> 2) * ( (c >> 5) & 0x7);
608	if (dict_size < MIN_DICTIONARY_SIZE || dict_size > MAX_DICTIONARY_SIZE)
609		return 0;
610	return dict_size;
611}
612
613static off_t
614unlz(int fin, int fout, char *pre, size_t prelen, off_t *bytes_in)
615{
616	if (lz_crc[0] == 0)
617		lz_crc_init();
618
619	char header[HDR_SIZE];
620
621	if (pre && prelen)
622		memcpy(header, pre, prelen);
623
624	ssize_t nr = read(fin, header + prelen, sizeof(header) - prelen);
625	switch (nr) {
626	case -1:
627		return -1;
628	case 0:
629		return prelen ? -1 : 0;
630	default:
631		if ((size_t)nr != sizeof(header) - prelen)
632			return -1;
633		break;
634	}
635
636	if (memcmp(header, hdrmagic, sizeof(hdrmagic)) != 0)
637		return -1;
638
639	unsigned dict_size = lz_get_dict_size(header[5]);
640	if (dict_size == 0)
641		return -1;
642
643	return lz_decode(fin, fout, dict_size, bytes_in);
644}
645