1/*
2 * Copyright (c) 2004-2016 Maxim Sobolev <sobomax@FreeBSD.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD: stable/10/usr.bin/mkuzip/mkuzip.c 319267 2017-05-30 22:48:17Z asomers $");
29
30#include <sys/types.h>
31#include <sys/disk.h>
32#include <sys/endian.h>
33#include <sys/param.h>
34#include <sys/sysctl.h>
35#include <sys/stat.h>
36#include <sys/uio.h>
37#include <netinet/in.h>
38#include <assert.h>
39#include <ctype.h>
40#include <err.h>
41#include <fcntl.h>
42#include <pthread.h>
43#include <signal.h>
44#include <stdint.h>
45#include <stdio.h>
46#include <stdlib.h>
47#include <string.h>
48#include <unistd.h>
49
50#include "mkuzip.h"
51#include "mkuz_cloop.h"
52#include "mkuz_blockcache.h"
53#include "mkuz_zlib.h"
54#include "mkuz_lzma.h"
55#include "mkuz_blk.h"
56#include "mkuz_cfg.h"
57#include "mkuz_conveyor.h"
58#include "mkuz_format.h"
59#include "mkuz_fqueue.h"
60#include "mkuz_time.h"
61
62#define DEFAULT_CLSTSIZE	16384
63
64static struct mkuz_format uzip_fmt = {
65	.magic = CLOOP_MAGIC_ZLIB,
66	.default_sufx = DEFAULT_SUFX_ZLIB,
67	.f_init = &mkuz_zlib_init,
68	.f_compress = &mkuz_zlib_compress
69};
70
71static struct mkuz_format ulzma_fmt = {
72        .magic = CLOOP_MAGIC_LZMA,
73        .default_sufx = DEFAULT_SUFX_LZMA,
74        .f_init = &mkuz_lzma_init,
75        .f_compress = &mkuz_lzma_compress
76};
77
/* Forward declarations of helpers defined after main(). */
static void usage(void);
static void cleanup(void);
static struct mkuz_blk *readblock(int fd, u_int32_t clstsize);

/*
 * Path that cleanup() unlinks at exit; NULL means there is nothing to
 * remove.
 */
static char *cleanfile = NULL;
83
84static int
85cmp_blkno(const struct mkuz_blk *bp, void *p)
86{
87	uint32_t *ap;
88
89	ap = (uint32_t *)p;
90
91	return (bp->info.blkno == *ap);
92}
93
94int main(int argc, char **argv)
95{
96	struct mkuz_cfg cfs;
97	char *iname, *oname;
98	uint64_t *toc;
99	int i, io, opt, tmp;
100	struct {
101		int en;
102		FILE *f;
103	} summary;
104	struct iovec iov[2];
105	struct stat sb;
106	uint64_t offset, last_offset;
107	struct cloop_header hdr;
108	struct mkuz_conveyor *cvp;
109        void *c_ctx;
110	struct mkuz_blk_info *chit;
111	size_t ncpusz, ncpu, magiclen;
112	double st, et;
113
114	st = getdtime();
115
116	ncpusz = sizeof(size_t);
117	if (sysctlbyname("hw.ncpu", &ncpu, &ncpusz, NULL, 0) < 0) {
118		ncpu = 1;
119	} else if (ncpu > MAX_WORKERS_AUTO) {
120		ncpu = MAX_WORKERS_AUTO;
121	}
122
123	memset(&hdr, 0, sizeof(hdr));
124	cfs.blksz = DEFAULT_CLSTSIZE;
125	oname = NULL;
126	cfs.verbose = 0;
127	cfs.no_zcomp = 0;
128	cfs.en_dedup = 0;
129	summary.en = 0;
130	summary.f = stderr;
131	cfs.handler = &uzip_fmt;
132	cfs.nworkers = ncpu;
133	struct mkuz_blk *iblk, *oblk;
134
135	while((opt = getopt(argc, argv, "o:s:vZdLSj:")) != -1) {
136		switch(opt) {
137		case 'o':
138			oname = optarg;
139			break;
140
141		case 's':
142			tmp = atoi(optarg);
143			if (tmp <= 0) {
144				errx(1, "invalid cluster size specified: %s",
145				    optarg);
146				/* Not reached */
147			}
148			cfs.blksz = tmp;
149			break;
150
151		case 'v':
152			cfs.verbose = 1;
153			break;
154
155		case 'Z':
156			cfs.no_zcomp = 1;
157			break;
158
159		case 'd':
160			cfs.en_dedup = 1;
161			break;
162
163		case 'L':
164			cfs.handler = &ulzma_fmt;
165			break;
166
167		case 'S':
168			summary.en = 1;
169			summary.f = stdout;
170			break;
171
172		case 'j':
173			tmp = atoi(optarg);
174			if (tmp <= 0) {
175				errx(1, "invalid number of compression threads"
176                                    " specified: %s", optarg);
177				/* Not reached */
178			}
179			cfs.nworkers = tmp;
180			break;
181
182		default:
183			usage();
184			/* Not reached */
185		}
186	}
187	argc -= optind;
188	argv += optind;
189
190	if (argc != 1) {
191		usage();
192		/* Not reached */
193	}
194
195	magiclen = strlcpy(hdr.magic, cfs.handler->magic, sizeof(hdr.magic));
196	assert(magiclen < sizeof(hdr.magic));
197
198	if (cfs.en_dedup != 0) {
199		hdr.magic[CLOOP_OFS_VERSN] = CLOOP_MAJVER_3;
200		hdr.magic[CLOOP_OFS_COMPR] =
201		    tolower(hdr.magic[CLOOP_OFS_COMPR]);
202	}
203
204	c_ctx = cfs.handler->f_init(cfs.blksz);
205
206	iname = argv[0];
207	if (oname == NULL) {
208		asprintf(&oname, "%s%s", iname, cfs.handler->default_sufx);
209		if (oname == NULL) {
210			err(1, "can't allocate memory");
211			/* Not reached */
212		}
213	}
214
215	signal(SIGHUP, exit);
216	signal(SIGINT, exit);
217	signal(SIGTERM, exit);
218	signal(SIGXCPU, exit);
219	signal(SIGXFSZ, exit);
220	atexit(cleanup);
221
222	cfs.fdr = open(iname, O_RDONLY);
223	if (cfs.fdr < 0) {
224		err(1, "open(%s)", iname);
225		/* Not reached */
226	}
227	if (fstat(cfs.fdr, &sb) != 0) {
228		err(1, "fstat(%s)", iname);
229		/* Not reached */
230	}
231	if (S_ISCHR(sb.st_mode)) {
232		off_t ms;
233
234		if (ioctl(cfs.fdr, DIOCGMEDIASIZE, &ms) < 0) {
235			err(1, "ioctl(DIOCGMEDIASIZE)");
236			/* Not reached */
237		}
238		sb.st_size = ms;
239	} else if (!S_ISREG(sb.st_mode)) {
240		fprintf(stderr, "%s: not a character device or regular file\n",
241			iname);
242		exit(1);
243	}
244	hdr.nblocks = sb.st_size / cfs.blksz;
245	if ((sb.st_size % cfs.blksz) != 0) {
246		if (cfs.verbose != 0)
247			fprintf(stderr, "file size is not multiple "
248			"of %d, padding data\n", cfs.blksz);
249		hdr.nblocks++;
250	}
251	toc = mkuz_safe_malloc((hdr.nblocks + 1) * sizeof(*toc));
252
253	cfs.fdw = open(oname, (cfs.en_dedup ? O_RDWR : O_WRONLY) | O_TRUNC | O_CREAT,
254		   S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
255	if (cfs.fdw < 0) {
256		err(1, "open(%s)", oname);
257		/* Not reached */
258	}
259	cleanfile = oname;
260
261	/* Prepare header that we will write later when we have index ready. */
262	iov[0].iov_base = (char *)&hdr;
263	iov[0].iov_len = sizeof(hdr);
264	iov[1].iov_base = (char *)toc;
265	iov[1].iov_len = (hdr.nblocks + 1) * sizeof(*toc);
266	offset = iov[0].iov_len + iov[1].iov_len;
267
268	/* Reserve space for header */
269	lseek(cfs.fdw, offset, SEEK_SET);
270
271	if (cfs.verbose != 0) {
272		fprintf(stderr, "data size %ju bytes, number of clusters "
273		    "%u, index length %zu bytes\n", sb.st_size,
274		    hdr.nblocks, iov[1].iov_len);
275	}
276
277	cvp = mkuz_conveyor_ctor(&cfs);
278
279	last_offset = 0;
280        iblk = oblk = NULL;
281	for(i = io = 0; iblk != MKUZ_BLK_EOF; i++) {
282		iblk = readblock(cfs.fdr, cfs.blksz);
283		mkuz_fqueue_enq(cvp->wrk_queue, iblk);
284		if (iblk != MKUZ_BLK_EOF &&
285		    (i < (cfs.nworkers * ITEMS_PER_WORKER))) {
286			continue;
287		}
288drain:
289		oblk = mkuz_fqueue_deq_when(cvp->results, cmp_blkno, &io);
290		assert(oblk->info.blkno == (unsigned)io);
291		oblk->info.offset = offset;
292		chit = NULL;
293		if (cfs.en_dedup != 0 && oblk->info.len > 0) {
294			chit = mkuz_blkcache_regblock(cfs.fdw, oblk);
295			/*
296			 * There should be at least one non-empty block
297			 * between us and the backref'ed offset, otherwise
298			 * we won't be able to parse that sequence correctly
299			 * as it would be indistinguishible from another
300			 * empty block.
301			 */
302			if (chit != NULL && chit->offset == last_offset) {
303				chit = NULL;
304			}
305		}
306		if (chit != NULL) {
307			toc[io] = htobe64(chit->offset);
308			oblk->info.len = 0;
309		} else {
310			if (oblk->info.len > 0 && write(cfs.fdw, oblk->data,
311			    oblk->info.len) < 0) {
312				err(1, "write(%s)", oname);
313				/* Not reached */
314			}
315			toc[io] = htobe64(offset);
316			last_offset = offset;
317			offset += oblk->info.len;
318		}
319		if (cfs.verbose != 0) {
320			fprintf(stderr, "cluster #%d, in %u bytes, "
321			    "out len=%lu offset=%lu", io, cfs.blksz,
322			    (u_long)oblk->info.len, (u_long)be64toh(toc[io]));
323			if (chit != NULL) {
324				fprintf(stderr, " (backref'ed to #%d)",
325				    chit->blkno);
326			}
327			fprintf(stderr, "\n");
328		}
329		free(oblk);
330		io += 1;
331		if (iblk == MKUZ_BLK_EOF) {
332			if (io < i)
333				goto drain;
334			/* Last block, see if we need to add some padding */
335			if ((offset % DEV_BSIZE) == 0)
336				continue;
337			oblk = mkuz_blk_ctor(DEV_BSIZE - (offset % DEV_BSIZE));
338			oblk->info.blkno = io;
339			oblk->info.len = oblk->alen;
340			if (cfs.verbose != 0) {
341				fprintf(stderr, "padding data with %lu bytes "
342				    "so that file size is multiple of %d\n",
343				    (u_long)oblk->alen, DEV_BSIZE);
344			}
345			mkuz_fqueue_enq(cvp->results, oblk);
346			goto drain;
347		}
348	}
349
350	close(cfs.fdr);
351
352	if (cfs.verbose != 0 || summary.en != 0) {
353		et = getdtime();
354		fprintf(summary.f, "compressed data to %ju bytes, saved %lld "
355		    "bytes, %.2f%% decrease, %.2f bytes/sec.\n", offset,
356		    (long long)(sb.st_size - offset),
357		    100.0 * (long long)(sb.st_size - offset) /
358		    (float)sb.st_size, (float)sb.st_size / (et - st));
359	}
360
361	/* Convert to big endian */
362	hdr.blksz = htonl(cfs.blksz);
363	hdr.nblocks = htonl(hdr.nblocks);
364	/* Write headers into pre-allocated space */
365	lseek(cfs.fdw, 0, SEEK_SET);
366	if (writev(cfs.fdw, iov, 2) < 0) {
367		err(1, "writev(%s)", oname);
368		/* Not reached */
369	}
370	cleanfile = NULL;
371	close(cfs.fdw);
372
373	exit(0);
374}
375
376static struct mkuz_blk *
377readblock(int fd, u_int32_t clstsize)
378{
379	int numread;
380	struct mkuz_blk *rval;
381	static int blockcnt;
382	off_t cpos;
383
384	rval = mkuz_blk_ctor(clstsize);
385
386	rval->info.blkno = blockcnt;
387	blockcnt += 1;
388	cpos = lseek(fd, 0, SEEK_CUR);
389	if (cpos < 0) {
390		err(1, "readblock: lseek() failed");
391		/* Not reached */
392	}
393	rval->info.offset = cpos;
394
395	numread = read(fd, rval->data, clstsize);
396	if (numread < 0) {
397		err(1, "readblock: read() failed");
398		/* Not reached */
399	}
400	if (numread == 0) {
401		free(rval);
402		return MKUZ_BLK_EOF;
403	}
404	rval->info.len = numread;
405	return rval;
406}
407
/*
 * Print the command synopsis on stderr and terminate with exit status 1.
 */
static void
usage(void)
{
	static const char synopsis[] =
	    "usage: mkuzip [-vZdLS] [-o outfile] [-s cluster_size] "
	    "[-j ncompr] infile\n";

	fputs(synopsis, stderr);
	exit(1);
}
416
/*
 * malloc(3) wrapper that never returns NULL: if the allocation fails the
 * program is terminated through err() with exit status 1.
 */
void *
mkuz_safe_malloc(size_t size)
{
	void *mem = malloc(size);

	if (mem != NULL)
		return mem;

	err(1, "can't allocate memory");
	/* Not reached */
	return NULL;
}
429
/*
 * Allocate size bytes with mkuz_safe_malloc() and clear them to zero
 * before returning the buffer.
 */
void *
mkuz_safe_zmalloc(size_t size)
{
	void *mem = mkuz_safe_malloc(size);

	memset(mem, 0, size);
	return mem;
}
439
440static void
441cleanup(void)
442{
443
444	if (cleanfile != NULL)
445		unlink(cleanfile);
446}
447
/*
 * Check whether every one of the size bytes at memory equals val.
 *
 * Returns 1 when all bytes match and 0 otherwise.  An empty region
 * (size == 0) trivially matches and yields 1 without touching memory.
 */
int
mkuz_memvcmp(const void *memory, unsigned char val, size_t size)
{
    const unsigned char *mm;

    /*
     * Without this guard a zero size would read mm[0] and wrap size - 1
     * around to SIZE_MAX in the memcmp() call below.
     */
    if (size == 0)
        return 1;

    mm = (const unsigned char *)memory;
    if (*mm != val)
        return 0;
    /*
     * The first byte equals val; comparing the buffer with itself shifted
     * by one byte verifies that each byte equals its predecessor.
     */
    return memcmp(mm, mm + 1, size - 1) == 0;
}
456