1/*	$NetBSD: unlz.c,v 1.6 2018/11/11 01:42:36 christos Exp $	*/
2
3/*-
4 * Copyright (c) 2018 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Christos Zoulas.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32/*  Lzd - Educational decompressor for the lzip format
33    Copyright (C) 2013-2018 Antonio Diaz Diaz.
34
35    This program is free software. Redistribution and use in source and
36    binary forms, with or without modification, are permitted provided
37    that the following conditions are met:
38
39    1. Redistributions of source code must retain the above copyright
40    notice, this list of conditions and the following disclaimer.
41
42    2. Redistributions in binary form must reproduce the above copyright
43    notice, this list of conditions and the following disclaimer in the
44    documentation and/or other materials provided with the distribution.
45
46    This program is distributed in the hope that it will be useful,
47    but WITHOUT ANY WARRANTY; without even the implied warranty of
48    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
49*/
50
51#include <sys/param.h>
52#include <stdio.h>
53#include <string.h>
54#include <stdlib.h>
55#include <stdint.h>
56#include <stdbool.h>
57#include <errno.h>
58#include <unistd.h>
59
60#define LZ_STATES		12
61
62#define LITERAL_CONTEXT_BITS	3
63#define POS_STATE_BITS		2
64#define POS_STATES		(1 << POS_STATE_BITS)
65#define POS_STATE_MASK 		(POS_STATES - 1)
66
67#define STATES			4
68#define DIS_SLOT_BITS		6
69
70#define DIS_MODEL_START		4
71#define DIS_MODEL_END		14
72
73#define MODELED_DISTANCES	(1 << (DIS_MODEL_END / 2))
74#define DIS_ALIGN_BITS		4
75#define DIS_ALIGN_SIZE		(1 << DIS_ALIGN_BITS)
76
77#define LOW_BITS		3
78#define MID_BITS		3
79#define HIGH_BITS		8
80
81#define LOW_SYMBOLS		(1 << LOW_BITS)
82#define MID_SYMBOLS		(1 << MID_BITS)
83#define HIGH_SYMBOLS		(1 << HIGH_BITS)
84
85#define MAX_SYMBOLS 		(LOW_SYMBOLS + MID_SYMBOLS + HIGH_SYMBOLS)
86
87#define MIN_MATCH_LEN		2
88
89#define BIT_MODEL_MOVE_BITS	5
90#define BIT_MODEL_TOTAL_BITS 	11
91#define BIT_MODEL_TOTAL 	(1 << BIT_MODEL_TOTAL_BITS)
92#define BIT_MODEL_INIT		(BIT_MODEL_TOTAL / 2)
93
94static const int lz_st_next[] = {
95	0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5,
96};
97
98static bool
99lz_st_is_char(int st) {
100	return st < 7;
101}
102
103static int
104lz_st_get_char(int st) {
105	return lz_st_next[st];
106}
107
108static int
109lz_st_get_match(int st) {
110	return st < 7 ? 7 : 10;
111}
112
113static int
114lz_st_get_rep(int st) {
115	return st < 7 ? 8 : 11;
116}
117
118static int
119lz_st_get_short_rep(int st) {
120	return st < 7 ? 9 : 11;
121}
122
123struct lz_len_model {
124	int choice1;
125	int choice2;
126	int bm_low[POS_STATES][LOW_SYMBOLS];
127	int bm_mid[POS_STATES][MID_SYMBOLS];
128	int bm_high[HIGH_SYMBOLS];
129};
130
131static uint32_t lz_crc[256];
132
133static void
134lz_crc_init(void)
135{
136	for (unsigned i = 0; i < nitems(lz_crc); i++) {
137		unsigned c = i;
138		for (unsigned j = 0; j < 8; j++) {
139			if (c & 1)
140				c = 0xEDB88320U ^ (c >> 1);
141			else
142				c >>= 1;
143		}
144		lz_crc[i] = c;
145      }
146}
147
148static void
149lz_crc_update(uint32_t *crc, const uint8_t *buf, size_t len)
150{
151	for (size_t i = 0; i < len; i++)
152		*crc = lz_crc[(*crc ^ buf[i]) & 0xFF] ^ (*crc >> 8);
153}
154
155struct lz_range_decoder {
156	FILE *fp;
157	uint32_t code;
158	uint32_t range;
159};
160
161static int
162lz_rd_create(struct lz_range_decoder *rd, FILE *fp)
163{
164	rd->fp = fp;
165	rd->code = 0;
166	rd->range = ~0;
167	for (int i = 0; i < 5; i++)
168		rd->code = (rd->code << 8) | (uint8_t)getc(rd->fp);
169	return ferror(rd->fp) ? -1 : 0;
170}
171
172static unsigned
173lz_rd_decode(struct lz_range_decoder *rd, int num_bits)
174{
175	unsigned symbol = 0;
176
177	for (int i = num_bits; i > 0; i--) {
178		rd->range >>= 1;
179		symbol <<= 1;
180		if (rd->code >= rd->range) {
181			rd->code -= rd->range;
182			symbol |= 1;
183		}
184		if (rd->range <= 0x00FFFFFFU) {
185			rd->range <<= 8;
186			rd->code = (rd->code << 8) | (uint8_t)getc(rd->fp);
187		}
188	}
189
190	return symbol;
191}
192
193static unsigned
194lz_rd_decode_bit(struct lz_range_decoder *rd, int *bm)
195{
196	unsigned symbol;
197	const uint32_t bound = (rd->range >> BIT_MODEL_TOTAL_BITS) * *bm;
198
199	if(rd->code < bound) {
200		rd->range = bound;
201		*bm += (BIT_MODEL_TOTAL - *bm) >> BIT_MODEL_MOVE_BITS;
202		symbol = 0;
203	}
204	else {
205		rd->range -= bound;
206		rd->code -= bound;
207		*bm -= *bm >> BIT_MODEL_MOVE_BITS;
208		symbol = 1;
209	}
210
211	if (rd->range <= 0x00FFFFFFU) {
212		rd->range <<= 8;
213		rd->code = (rd->code << 8) | (uint8_t)getc(rd->fp);
214	}
215	return symbol;
216}
217
218static unsigned
219lz_rd_decode_tree(struct lz_range_decoder *rd, int *bm, int num_bits)
220{
221	unsigned symbol = 1;
222
223	for (int i = 0; i < num_bits; i++)
224		symbol = (symbol << 1) | lz_rd_decode_bit(rd, &bm[symbol]);
225
226	return symbol - (1 << num_bits);
227}
228
229static unsigned
230lz_rd_decode_tree_reversed(struct lz_range_decoder *rd, int *bm, int num_bits)
231{
232	unsigned symbol = lz_rd_decode_tree(rd, bm, num_bits);
233	unsigned reversed_symbol = 0;
234
235	for (int i = 0; i < num_bits; i++) {
236		reversed_symbol = (reversed_symbol << 1) | (symbol & 1);
237		symbol >>= 1;
238	}
239
240	return reversed_symbol;
241}
242
243static unsigned
244lz_rd_decode_matched(struct lz_range_decoder *rd, int *bm, int match_byte)
245{
246	unsigned symbol = 1;
247
248	for (int i = 7; i >= 0; i--) {
249		const unsigned match_bit = (match_byte >> i) & 1;
250		const unsigned bit = lz_rd_decode_bit(rd,
251		    &bm[symbol + (match_bit << 8) + 0x100]);
252		symbol = (symbol << 1) | bit;
253		if (match_bit != bit) {
254			while (symbol < 0x100) {
255				symbol = (symbol << 1) |
256				    lz_rd_decode_bit(rd, &bm[symbol]);
257			}
258			break;
259		}
260	}
261	return symbol & 0xFF;
262}
263
264static unsigned
265lz_rd_decode_len(struct lz_range_decoder *rd, struct lz_len_model *lm,
266    int pos_state)
267{
268	if (lz_rd_decode_bit(rd, &lm->choice1) == 0)
269		return lz_rd_decode_tree(rd, lm->bm_low[pos_state], LOW_BITS);
270
271	if (lz_rd_decode_bit(rd, &lm->choice2) == 0) {
272		return LOW_SYMBOLS +
273		    lz_rd_decode_tree(rd, lm->bm_mid[pos_state], MID_BITS);
274	}
275
276	return LOW_SYMBOLS + MID_SYMBOLS +
277           lz_rd_decode_tree(rd, lm->bm_high, HIGH_BITS);
278}
279
280struct lz_decoder {
281	FILE *fin, *fout;
282	off_t pos, ppos, spos, dict_size;
283	bool wrapped;
284	uint32_t crc;
285	uint8_t *obuf;
286	struct lz_range_decoder rdec;
287};
288
289static int
290lz_flush(struct lz_decoder *lz)
291{
292	off_t offs = lz->pos - lz->spos;
293	if (offs <= 0)
294		return -1;
295
296	size_t size = (size_t)offs;
297	lz_crc_update(&lz->crc, lz->obuf + lz->spos, size);
298	if (fwrite(lz->obuf + lz->spos, 1, size, lz->fout) != size)
299		return -1;
300
301	lz->wrapped = lz->pos >= lz->dict_size;
302	if (lz->wrapped) {
303		lz->ppos += lz->pos;
304		lz->pos = 0;
305	}
306	lz->spos = lz->pos;
307	return 0;
308}
309
310static void
311lz_destroy(struct lz_decoder *lz)
312{
313	if (lz->fin)
314		fclose(lz->fin);
315	if (lz->fout)
316		fclose(lz->fout);
317	free(lz->obuf);
318}
319
320static int
321lz_create(struct lz_decoder *lz, int fin, int fdout, int dict_size)
322{
323	memset(lz, 0, sizeof(*lz));
324
325	lz->fin = fdopen(dup(fin), "r");
326	if (lz->fin == NULL)
327		goto out;
328
329	lz->fout = fdopen(dup(fdout), "w");
330	if (lz->fout == NULL)
331		goto out;
332
333	lz->pos = lz->ppos = lz->spos = 0;
334	lz->crc = ~0;
335	lz->dict_size = dict_size;
336	lz->wrapped = false;
337
338	lz->obuf = malloc(dict_size);
339	if (lz->obuf == NULL)
340		goto out;
341
342	if (lz_rd_create(&lz->rdec, lz->fin) == -1)
343		goto out;
344	return 0;
345out:
346	lz_destroy(lz);
347	return -1;
348}
349
350static uint8_t
351lz_peek(const struct lz_decoder *lz, unsigned ahead)
352{
353	off_t diff = lz->pos - ahead - 1;
354
355	if (diff >= 0)
356		return lz->obuf[diff];
357
358	if (lz->wrapped)
359		return lz->obuf[lz->dict_size + diff];
360
361	return 0;
362}
363
364static void
365lz_put(struct lz_decoder *lz, uint8_t b)
366{
367	lz->obuf[lz->pos++] = b;
368	if (lz->dict_size == lz->pos)
369		lz_flush(lz);
370}
371
372static off_t
373lz_get_data_position(const struct lz_decoder *lz)
374{
375	return lz->ppos + lz->pos;
376}
377
378static unsigned
379lz_get_crc(const struct lz_decoder *lz)
380{
381	return lz->crc ^ 0xffffffffU;
382}
383
384static void
385lz_bm_init(int *a, size_t l)
386{
387	for (size_t i = 0; i < l; i++)
388		a[i] = BIT_MODEL_INIT;
389}
390
391#define LZ_BM_INIT(a)	lz_bm_init(a, nitems(a))
392#define LZ_BM_INIT2(a)	do { \
393	size_t l = nitems(a[0]); \
394	for (size_t i = 0; i < nitems(a); i++) \
395		lz_bm_init(a[i], l); \
396} while (/*CONSTCOND*/0)
397
398#define LZ_MODEL_INIT(a) do { \
399	a.choice1 = BIT_MODEL_INIT; \
400	a.choice2 = BIT_MODEL_INIT; \
401	LZ_BM_INIT2(a.bm_low); \
402	LZ_BM_INIT2(a.bm_mid); \
403	LZ_BM_INIT(a.bm_high); \
404} while (/*CONSTCOND*/0)
405
406static bool
407lz_decode_member(struct lz_decoder *lz)
408{
409	int bm_literal[1 << LITERAL_CONTEXT_BITS][0x300];
410	int bm_match[LZ_STATES][POS_STATES];
411	int bm_rep[4][LZ_STATES];
412	int bm_len[LZ_STATES][POS_STATES];
413	int bm_dis_slot[LZ_STATES][1 << DIS_SLOT_BITS];
414	int bm_dis[MODELED_DISTANCES - DIS_MODEL_END + 1];
415	int bm_align[DIS_ALIGN_SIZE];
416
417	LZ_BM_INIT2(bm_literal);
418	LZ_BM_INIT2(bm_match);
419	LZ_BM_INIT2(bm_rep);
420	LZ_BM_INIT2(bm_len);
421	LZ_BM_INIT2(bm_dis_slot);
422	LZ_BM_INIT(bm_dis);
423	LZ_BM_INIT(bm_align);
424
425	struct lz_len_model match_len_model;
426	struct lz_len_model rep_len_model;
427
428	LZ_MODEL_INIT(match_len_model);
429	LZ_MODEL_INIT(rep_len_model);
430
431	struct lz_range_decoder *rd = &lz->rdec;
432	unsigned rep[4] = { 0 };
433
434
435	int state = 0;
436
437	while (!feof(lz->fin) && !ferror(lz->fin)) {
438		const int pos_state = lz_get_data_position(lz) & POS_STATE_MASK;
439		// bit 1
440		if (lz_rd_decode_bit(rd, &bm_match[state][pos_state]) == 0) {
441			const uint8_t prev_byte = lz_peek(lz, 0);
442			const int literal_state =
443			    prev_byte >> (8 - LITERAL_CONTEXT_BITS);
444			int *bm = bm_literal[literal_state];
445			if (lz_st_is_char(state))
446				lz_put(lz, lz_rd_decode_tree(rd, bm, 8));
447			else {
448				int peek = lz_peek(lz, rep[0]);
449				lz_put(lz, lz_rd_decode_matched(rd, bm, peek));
450			}
451			state = lz_st_get_char(state);
452			continue;
453		}
454		int len;
455		// bit 2
456		if (lz_rd_decode_bit(rd, &bm_rep[0][state]) != 0) {
457			// bit 3
458			if (lz_rd_decode_bit(rd, &bm_rep[1][state]) == 0) {
459				// bit 4
460				if (lz_rd_decode_bit(rd,
461				    &bm_len[state][pos_state]) == 0)
462				{
463					state = lz_st_get_short_rep(state);
464					lz_put(lz, lz_peek(lz, rep[0]));
465					continue;
466				}
467			} else {
468				unsigned distance;
469				// bit 4
470				if (lz_rd_decode_bit(rd, &bm_rep[2][state])
471				    == 0)
472					distance = rep[1];
473				else {
474					// bit 5
475					if (lz_rd_decode_bit(rd,
476					    &bm_rep[3][state]) == 0)
477						distance = rep[2];
478					else {
479						distance = rep[3];
480						rep[3] = rep[2];
481					}
482					rep[2] = rep[1];
483				}
484				rep[1] = rep[0];
485				rep[0] = distance;
486			}
487			state = lz_st_get_rep(state);
488			len = MIN_MATCH_LEN +
489			    lz_rd_decode_len(rd, &rep_len_model, pos_state);
490		} else {
491			rep[3] = rep[2]; rep[2] = rep[1]; rep[1] = rep[0];
492			len = MIN_MATCH_LEN +
493			    lz_rd_decode_len(rd, &match_len_model, pos_state);
494			const int len_state =
495			    MIN(len - MIN_MATCH_LEN, STATES - 1);
496			rep[0] = lz_rd_decode_tree(rd, bm_dis_slot[len_state],
497			    DIS_SLOT_BITS);
498			if (rep[0] >= DIS_MODEL_START) {
499				const unsigned dis_slot = rep[0];
500				const int direct_bits = (dis_slot >> 1) - 1;
501			        rep[0] = (2 | (dis_slot & 1)) << direct_bits;
502				if (dis_slot < DIS_MODEL_END)
503					rep[0] += lz_rd_decode_tree_reversed(rd,
504					    &bm_dis[rep[0] - dis_slot],
505                                            direct_bits);
506				else {
507					rep[0] += lz_rd_decode(rd, direct_bits
508					    - DIS_ALIGN_BITS) << DIS_ALIGN_BITS;
509					rep[0] += lz_rd_decode_tree_reversed(rd,
510					    bm_align, DIS_ALIGN_BITS);
511					if (rep[0] == 0xFFFFFFFFU) {
512						lz_flush(lz);
513						return len == MIN_MATCH_LEN;
514					}
515				}
516			}
517			state = lz_st_get_match(state);
518			if (rep[0] >= lz->dict_size ||
519			    (rep[0] >= lz->pos && !lz->wrapped)) {
520				lz_flush(lz);
521				return false;
522			}
523		}
524		for (int i = 0; i < len; i++)
525			lz_put(lz, lz_peek(lz, rep[0]));
526    	}
527	lz_flush(lz);
528	return false;
529}
530
531/*
532 * 0-3	CRC32 of the uncompressed data
533 * 4-11 size of the uncompressed data
534 * 12-19 member size including header and trailer
535 */
536#define TRAILER_SIZE 20
537
538
539static off_t
540lz_decode(int fin, int fdout, unsigned dict_size, off_t *insize)
541{
542	struct lz_decoder lz;
543	off_t rv = -1;
544
545	if (lz_create(&lz, fin, fdout, dict_size) == -1)
546		return -1;
547
548	if (!lz_decode_member(&lz))
549		goto out;
550
551	uint8_t trailer[TRAILER_SIZE];
552
553	for(size_t i = 0; i < nitems(trailer); i++)
554		trailer[i] = (uint8_t)getc(lz.fin);
555
556	unsigned crc = 0;
557	for (int i = 3; i >= 0; --i) {
558		crc <<= 8;
559		crc += trailer[i];
560	}
561
562	int64_t data_size = 0;
563	for (int i = 11; i >= 4; --i) {
564		data_size <<= 8;
565		data_size += trailer[i];
566	}
567
568	if (crc != lz_get_crc(&lz) || data_size != lz_get_data_position(&lz))
569		goto out;
570
571	rv = 0;
572	for (int i = 19; i >= 12; --i) {
573		rv <<= 8;
574		rv += trailer[i];
575	}
576	if (insize)
577		*insize = rv;
578#if 0
579	/* Does not work with pipes */
580	rv = ftello(lz.fout);
581#else
582	rv = data_size;
583#endif
584out:
585	lz_destroy(&lz);
586	return rv;
587}
588
589
590/*
591 * 0-3 magic
592 * 4 version
593 * 5 coded dict_size
594 */
595#define HDR_SIZE 6
596#define MIN_DICTIONARY_SIZE (1 << 12)
597#define MAX_DICTIONARY_SIZE (1 << 29)
598
599static const char hdrmagic[] = { 'L', 'Z', 'I', 'P', 1 };
600
601static unsigned
602lz_get_dict_size(unsigned char c)
603{
604	unsigned dict_size = 1 << (c & 0x1f);
605	dict_size -= (dict_size >> 2) * ( (c >> 5) & 0x7);
606	if (dict_size < MIN_DICTIONARY_SIZE || dict_size > MAX_DICTIONARY_SIZE)
607		return 0;
608	return dict_size;
609}
610
611static off_t
612unlz(int fin, int fout, char *pre, size_t prelen, off_t *bytes_in)
613{
614	if (lz_crc[0] == 0)
615		lz_crc_init();
616
617	char header[HDR_SIZE];
618
619	if (pre && prelen)
620		memcpy(header, pre, prelen);
621
622	ssize_t nr = read(fin, header + prelen, sizeof(header) - prelen);
623	switch (nr) {
624	case -1:
625		return -1;
626	case 0:
627		return prelen ? -1 : 0;
628	default:
629		if ((size_t)nr != sizeof(header) - prelen)
630			return -1;
631		break;
632	}
633
634	if (memcmp(header, hdrmagic, sizeof(hdrmagic)) != 0)
635		return -1;
636
637	unsigned dict_size = lz_get_dict_size(header[5]);
638	if (dict_size == 0)
639		return -1;
640
641	return lz_decode(fin, fout, dict_size, bytes_in);
642}
643