mkuzip.c revision 303095
1135045Ssobomax/*
2303095Ssobomax * Copyright (c) 2004-2016 Maxim Sobolev <sobomax@FreeBSD.org>
3303095Ssobomax * All rights reserved.
4135045Ssobomax *
5303095Ssobomax * Redistribution and use in source and binary forms, with or without
6303095Ssobomax * modification, are permitted provided that the following conditions
7303095Ssobomax * are met:
8303095Ssobomax * 1. Redistributions of source code must retain the above copyright
9303095Ssobomax *    notice, this list of conditions and the following disclaimer.
10303095Ssobomax * 2. Redistributions in binary form must reproduce the above copyright
11303095Ssobomax *    notice, this list of conditions and the following disclaimer in the
12303095Ssobomax *    documentation and/or other materials provided with the distribution.
13135045Ssobomax *
14303095Ssobomax * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15303095Ssobomax * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16303095Ssobomax * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17303095Ssobomax * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18303095Ssobomax * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19303095Ssobomax * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20303095Ssobomax * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21303095Ssobomax * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22303095Ssobomax * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23303095Ssobomax * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24303095Ssobomax * SUCH DAMAGE.
25135045Ssobomax */
26135045Ssobomax
27303095Ssobomax#include <sys/cdefs.h>
28303095Ssobomax__FBSDID("$FreeBSD: stable/10/usr.bin/mkuzip/mkuzip.c 303095 2016-07-20 16:36:17Z sobomax $");
29303095Ssobomax
30135045Ssobomax#include <sys/types.h>
31167272Sfjoe#include <sys/disk.h>
32135045Ssobomax#include <sys/endian.h>
33135045Ssobomax#include <sys/param.h>
34303095Ssobomax#include <sys/sysctl.h>
35135045Ssobomax#include <sys/stat.h>
36135045Ssobomax#include <sys/uio.h>
37135045Ssobomax#include <netinet/in.h>
38303095Ssobomax#include <assert.h>
39303095Ssobomax#include <ctype.h>
40135045Ssobomax#include <err.h>
41135045Ssobomax#include <fcntl.h>
42303095Ssobomax#include <pthread.h>
43135045Ssobomax#include <signal.h>
44303095Ssobomax#include <stdint.h>
45135045Ssobomax#include <stdio.h>
46135045Ssobomax#include <stdlib.h>
47135045Ssobomax#include <string.h>
48135045Ssobomax#include <unistd.h>
49135045Ssobomax
50303095Ssobomax#include "mkuzip.h"
51303095Ssobomax#include "mkuz_cloop.h"
52303095Ssobomax#include "mkuz_blockcache.h"
53303095Ssobomax#include "mkuz_zlib.h"
54303095Ssobomax#include "mkuz_lzma.h"
55303095Ssobomax#include "mkuz_blk.h"
56303095Ssobomax#include "mkuz_cfg.h"
57303095Ssobomax#include "mkuz_conveyor.h"
58303095Ssobomax#include "mkuz_format.h"
59303095Ssobomax#include "mkuz_fqueue.h"
60303095Ssobomax#include "mkuz_time.h"
61135045Ssobomax
62303095Ssobomax#define DEFAULT_CLSTSIZE	16384
63135045Ssobomax
64303095Ssobomaxstatic struct mkuz_format uzip_fmt = {
65303095Ssobomax	.magic = CLOOP_MAGIC_ZLIB,
66303095Ssobomax	.default_sufx = DEFAULT_SUFX_ZLIB,
67303095Ssobomax	.f_init = &mkuz_zlib_init,
68303095Ssobomax	.f_compress = &mkuz_zlib_compress
69303095Ssobomax};
70303095Ssobomax
71303095Ssobomaxstatic struct mkuz_format ulzma_fmt = {
72303095Ssobomax        .magic = CLOOP_MAGIC_LZMA,
73303095Ssobomax        .default_sufx = DEFAULT_SUFX_LZMA,
74303095Ssobomax        .f_init = &mkuz_lzma_init,
75303095Ssobomax        .f_compress = &mkuz_lzma_compress
76303095Ssobomax};
77303095Ssobomax
/* Forward declarations of the file-local helpers used by main(). */
static void cleanup(void);
static struct mkuz_blk *readblock(int fd, u_int32_t clstsize);
static void usage(void);

/*
 * Path that cleanup() unlinks when the process exits; NULL means there
 * is nothing to remove.
 */
static char *cleanfile;
83135045Ssobomax
84303095Ssobomaxstatic int
85303095Ssobomaxcmp_blkno(const struct mkuz_blk *bp, void *p)
86303095Ssobomax{
87303095Ssobomax	uint32_t *ap;
88303095Ssobomax
89303095Ssobomax	ap = (uint32_t *)p;
90303095Ssobomax
91303095Ssobomax	return (bp->info.blkno == *ap);
92303095Ssobomax}
93303095Ssobomax
94135045Ssobomaxint main(int argc, char **argv)
95135045Ssobomax{
96303095Ssobomax	struct mkuz_cfg cfs;
97303095Ssobomax	char *iname, *oname;
98135045Ssobomax	uint64_t *toc;
99303095Ssobomax	int i, io, opt, tmp;
100303095Ssobomax	struct {
101303095Ssobomax		int en;
102303095Ssobomax		FILE *f;
103303095Ssobomax	} summary;
104135045Ssobomax	struct iovec iov[2];
105135045Ssobomax	struct stat sb;
106303095Ssobomax	uint64_t offset, last_offset;
107303095Ssobomax	struct cloop_header hdr;
108303095Ssobomax	struct mkuz_conveyor *cvp;
109303095Ssobomax        void *c_ctx;
110303095Ssobomax	struct mkuz_blk_info *chit;
111303095Ssobomax	size_t ncpusz, ncpu;
112303095Ssobomax	double st, et;
113135045Ssobomax
114303095Ssobomax	st = getdtime();
115303095Ssobomax
116303095Ssobomax	ncpusz = sizeof(size_t);
117303095Ssobomax	if (sysctlbyname("hw.ncpu", &ncpu, &ncpusz, NULL, 0) < 0) {
118303095Ssobomax		ncpu = 1;
119303095Ssobomax	} else if (ncpu > MAX_WORKERS_AUTO) {
120303095Ssobomax		ncpu = MAX_WORKERS_AUTO;
121303095Ssobomax	}
122303095Ssobomax
123135045Ssobomax	memset(&hdr, 0, sizeof(hdr));
124303095Ssobomax	cfs.blksz = DEFAULT_CLSTSIZE;
125135045Ssobomax	oname = NULL;
126303095Ssobomax	cfs.verbose = 0;
127303095Ssobomax	cfs.no_zcomp = 0;
128303095Ssobomax	cfs.en_dedup = 0;
129303095Ssobomax	summary.en = 0;
130303095Ssobomax	summary.f = stderr;
131303095Ssobomax	cfs.handler = &uzip_fmt;
132303095Ssobomax	cfs.nworkers = ncpu;
133303095Ssobomax	struct mkuz_blk *iblk, *oblk;
134135045Ssobomax
135303095Ssobomax	while((opt = getopt(argc, argv, "o:s:vZdLSj:")) != -1) {
136135045Ssobomax		switch(opt) {
137135045Ssobomax		case 'o':
138135045Ssobomax			oname = optarg;
139135045Ssobomax			break;
140135045Ssobomax
141135045Ssobomax		case 's':
142135045Ssobomax			tmp = atoi(optarg);
143135045Ssobomax			if (tmp <= 0) {
144135045Ssobomax				errx(1, "invalid cluster size specified: %s",
145135045Ssobomax				    optarg);
146135045Ssobomax				/* Not reached */
147135045Ssobomax			}
148303095Ssobomax			cfs.blksz = tmp;
149135045Ssobomax			break;
150135045Ssobomax
151135045Ssobomax		case 'v':
152303095Ssobomax			cfs.verbose = 1;
153135045Ssobomax			break;
154135045Ssobomax
155303095Ssobomax		case 'Z':
156303095Ssobomax			cfs.no_zcomp = 1;
157303095Ssobomax			break;
158303095Ssobomax
159303095Ssobomax		case 'd':
160303095Ssobomax			cfs.en_dedup = 1;
161303095Ssobomax			break;
162303095Ssobomax
163303095Ssobomax		case 'L':
164303095Ssobomax			cfs.handler = &ulzma_fmt;
165303095Ssobomax			break;
166303095Ssobomax
167303095Ssobomax		case 'S':
168303095Ssobomax			summary.en = 1;
169303095Ssobomax			summary.f = stdout;
170303095Ssobomax			break;
171303095Ssobomax
172303095Ssobomax		case 'j':
173303095Ssobomax			tmp = atoi(optarg);
174303095Ssobomax			if (tmp <= 0) {
175303095Ssobomax				errx(1, "invalid number of compression threads"
176303095Ssobomax                                    " specified: %s", optarg);
177303095Ssobomax				/* Not reached */
178303095Ssobomax			}
179303095Ssobomax			cfs.nworkers = tmp;
180303095Ssobomax			break;
181303095Ssobomax
182135045Ssobomax		default:
183135045Ssobomax			usage();
184135045Ssobomax			/* Not reached */
185135045Ssobomax		}
186135045Ssobomax	}
187135045Ssobomax	argc -= optind;
188135045Ssobomax	argv += optind;
189135045Ssobomax
190135045Ssobomax	if (argc != 1) {
191135045Ssobomax		usage();
192135045Ssobomax		/* Not reached */
193135045Ssobomax	}
194135045Ssobomax
195303095Ssobomax	strcpy(hdr.magic, cfs.handler->magic);
196303095Ssobomax
197303095Ssobomax	if (cfs.en_dedup != 0) {
198303095Ssobomax		hdr.magic[CLOOP_OFS_VERSN] = CLOOP_MAJVER_3;
199303095Ssobomax		hdr.magic[CLOOP_OFS_COMPR] =
200303095Ssobomax		    tolower(hdr.magic[CLOOP_OFS_COMPR]);
201303095Ssobomax	}
202303095Ssobomax
203303095Ssobomax	c_ctx = cfs.handler->f_init(cfs.blksz);
204303095Ssobomax
205135045Ssobomax	iname = argv[0];
206135045Ssobomax	if (oname == NULL) {
207303095Ssobomax		asprintf(&oname, "%s%s", iname, cfs.handler->default_sufx);
208135045Ssobomax		if (oname == NULL) {
209135045Ssobomax			err(1, "can't allocate memory");
210135045Ssobomax			/* Not reached */
211135045Ssobomax		}
212135045Ssobomax	}
213135045Ssobomax
214135045Ssobomax	signal(SIGHUP, exit);
215135045Ssobomax	signal(SIGINT, exit);
216135045Ssobomax	signal(SIGTERM, exit);
217135045Ssobomax	signal(SIGXCPU, exit);
218135045Ssobomax	signal(SIGXFSZ, exit);
219135045Ssobomax	atexit(cleanup);
220135045Ssobomax
221303095Ssobomax	cfs.fdr = open(iname, O_RDONLY);
222303095Ssobomax	if (cfs.fdr < 0) {
223167272Sfjoe		err(1, "open(%s)", iname);
224135045Ssobomax		/* Not reached */
225135045Ssobomax	}
226303095Ssobomax	if (fstat(cfs.fdr, &sb) != 0) {
227167272Sfjoe		err(1, "fstat(%s)", iname);
228167272Sfjoe		/* Not reached */
229167272Sfjoe	}
230167272Sfjoe	if (S_ISCHR(sb.st_mode)) {
231167272Sfjoe		off_t ms;
232167272Sfjoe
233303095Ssobomax		if (ioctl(cfs.fdr, DIOCGMEDIASIZE, &ms) < 0) {
234167272Sfjoe			err(1, "ioctl(DIOCGMEDIASIZE)");
235167272Sfjoe			/* Not reached */
236167272Sfjoe		}
237167272Sfjoe		sb.st_size = ms;
238167272Sfjoe	} else if (!S_ISREG(sb.st_mode)) {
239167272Sfjoe		fprintf(stderr, "%s: not a character device or regular file\n",
240167272Sfjoe			iname);
241167272Sfjoe		exit(1);
242167272Sfjoe	}
243303095Ssobomax	hdr.nblocks = sb.st_size / cfs.blksz;
244303095Ssobomax	if ((sb.st_size % cfs.blksz) != 0) {
245303095Ssobomax		if (cfs.verbose != 0)
246135058Ssobomax			fprintf(stderr, "file size is not multiple "
247303095Ssobomax			"of %d, padding data\n", cfs.blksz);
248135058Ssobomax		hdr.nblocks++;
249135045Ssobomax	}
250303095Ssobomax	toc = mkuz_safe_malloc((hdr.nblocks + 1) * sizeof(*toc));
251135045Ssobomax
252303095Ssobomax	cfs.fdw = open(oname, (cfs.en_dedup ? O_RDWR : O_WRONLY) | O_TRUNC | O_CREAT,
253146107Sfjoe		   S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
254303095Ssobomax	if (cfs.fdw < 0) {
255155074Spjd		err(1, "open(%s)", oname);
256135045Ssobomax		/* Not reached */
257135045Ssobomax	}
258135045Ssobomax	cleanfile = oname;
259135045Ssobomax
260135045Ssobomax	/* Prepare header that we will write later when we have index ready. */
261135045Ssobomax	iov[0].iov_base = (char *)&hdr;
262135045Ssobomax	iov[0].iov_len = sizeof(hdr);
263135045Ssobomax	iov[1].iov_base = (char *)toc;
264135045Ssobomax	iov[1].iov_len = (hdr.nblocks + 1) * sizeof(*toc);
265135045Ssobomax	offset = iov[0].iov_len + iov[1].iov_len;
266135045Ssobomax
267135045Ssobomax	/* Reserve space for header */
268303095Ssobomax	lseek(cfs.fdw, offset, SEEK_SET);
269135045Ssobomax
270303095Ssobomax	if (cfs.verbose != 0) {
271145808Ssobomax		fprintf(stderr, "data size %ju bytes, number of clusters "
272146107Sfjoe		    "%u, index length %zu bytes\n", sb.st_size,
273135058Ssobomax		    hdr.nblocks, iov[1].iov_len);
274303095Ssobomax	}
275135045Ssobomax
276303095Ssobomax	cvp = mkuz_conveyor_ctor(&cfs);
277303095Ssobomax
278303095Ssobomax	last_offset = 0;
279303095Ssobomax        iblk = oblk = NULL;
280303095Ssobomax	for(i = io = 0; iblk != MKUZ_BLK_EOF; i++) {
281303095Ssobomax		iblk = readblock(cfs.fdr, cfs.blksz);
282303095Ssobomax		mkuz_fqueue_enq(cvp->wrk_queue, iblk);
283303095Ssobomax		if (iblk != MKUZ_BLK_EOF &&
284303095Ssobomax		    (i < (cfs.nworkers * ITEMS_PER_WORKER))) {
285303095Ssobomax			continue;
286303095Ssobomax		}
287303095Ssobomaxdrain:
288303095Ssobomax		oblk = mkuz_fqueue_deq_when(cvp->results, cmp_blkno, &io);
289303095Ssobomax		assert(oblk->info.blkno == (unsigned)io);
290303095Ssobomax		oblk->info.offset = offset;
291303095Ssobomax		chit = NULL;
292303095Ssobomax		if (cfs.en_dedup != 0 && oblk->info.len > 0) {
293303095Ssobomax			chit = mkuz_blkcache_regblock(cfs.fdw, oblk);
294303095Ssobomax			/*
295303095Ssobomax			 * There should be at least one non-empty block
296303095Ssobomax			 * between us and the backref'ed offset, otherwise
297303095Ssobomax			 * we won't be able to parse that sequence correctly
298303095Ssobomax			 * as it would be indistinguishible from another
299303095Ssobomax			 * empty block.
300303095Ssobomax			 */
301303095Ssobomax			if (chit != NULL && chit->offset == last_offset) {
302303095Ssobomax				chit = NULL;
303303095Ssobomax			}
304303095Ssobomax		}
305303095Ssobomax		if (chit != NULL) {
306303095Ssobomax			toc[io] = htobe64(chit->offset);
307303095Ssobomax			oblk->info.len = 0;
308303095Ssobomax		} else {
309303095Ssobomax			if (oblk->info.len > 0 && write(cfs.fdw, oblk->data,
310303095Ssobomax			    oblk->info.len) < 0) {
311303095Ssobomax				err(1, "write(%s)", oname);
312135045Ssobomax				/* Not reached */
313135045Ssobomax			}
314303095Ssobomax			toc[io] = htobe64(offset);
315303095Ssobomax			last_offset = offset;
316303095Ssobomax			offset += oblk->info.len;
317135045Ssobomax		}
318303095Ssobomax		if (cfs.verbose != 0) {
319303095Ssobomax			fprintf(stderr, "cluster #%d, in %u bytes, "
320303095Ssobomax			    "out len=%lu offset=%lu", io, cfs.blksz,
321303095Ssobomax			    (u_long)oblk->info.len, (u_long)be64toh(toc[io]));
322303095Ssobomax			if (chit != NULL) {
323303095Ssobomax				fprintf(stderr, " (backref'ed to #%d)",
324303095Ssobomax				    chit->blkno);
325303095Ssobomax			}
326303095Ssobomax			fprintf(stderr, "\n");
327135045Ssobomax		}
328303095Ssobomax		free(oblk);
329303095Ssobomax		io += 1;
330303095Ssobomax		if (iblk == MKUZ_BLK_EOF) {
331303095Ssobomax			if (io < i)
332303095Ssobomax				goto drain;
333303095Ssobomax			/* Last block, see if we need to add some padding */
334303095Ssobomax			if ((offset % DEV_BSIZE) == 0)
335303095Ssobomax				continue;
336303095Ssobomax			oblk = mkuz_blk_ctor(DEV_BSIZE - (offset % DEV_BSIZE));
337303095Ssobomax			oblk->info.blkno = io;
338303095Ssobomax			oblk->info.len = oblk->alen;
339303095Ssobomax			if (cfs.verbose != 0) {
340303095Ssobomax				fprintf(stderr, "padding data with %lu bytes "
341303095Ssobomax				    "so that file size is multiple of %d\n",
342303095Ssobomax				    (u_long)oblk->alen, DEV_BSIZE);
343303095Ssobomax			}
344303095Ssobomax			mkuz_fqueue_enq(cvp->results, oblk);
345303095Ssobomax			goto drain;
346303095Ssobomax		}
347135045Ssobomax	}
348135045Ssobomax
349303095Ssobomax	close(cfs.fdr);
350135045Ssobomax
351303095Ssobomax	if (cfs.verbose != 0 || summary.en != 0) {
352303095Ssobomax		et = getdtime();
353303095Ssobomax		fprintf(summary.f, "compressed data to %ju bytes, saved %lld "
354303095Ssobomax		    "bytes, %.2f%% decrease, %.2f bytes/sec.\n", offset,
355303095Ssobomax		    (long long)(sb.st_size - offset),
356303095Ssobomax		    100.0 * (long long)(sb.st_size - offset) /
357303095Ssobomax		    (float)sb.st_size, (float)sb.st_size / (et - st));
358303095Ssobomax	}
359303095Ssobomax
360135045Ssobomax	/* Convert to big endian */
361303095Ssobomax	hdr.blksz = htonl(cfs.blksz);
362135045Ssobomax	hdr.nblocks = htonl(hdr.nblocks);
363135045Ssobomax	/* Write headers into pre-allocated space */
364303095Ssobomax	lseek(cfs.fdw, 0, SEEK_SET);
365303095Ssobomax	if (writev(cfs.fdw, iov, 2) < 0) {
366155074Spjd		err(1, "writev(%s)", oname);
367135045Ssobomax		/* Not reached */
368135045Ssobomax	}
369135045Ssobomax	cleanfile = NULL;
370303095Ssobomax	close(cfs.fdw);
371135045Ssobomax
372135045Ssobomax	exit(0);
373135045Ssobomax}
374135045Ssobomax
375303095Ssobomaxstatic struct mkuz_blk *
376303095Ssobomaxreadblock(int fd, u_int32_t clstsize)
377135058Ssobomax{
378135045Ssobomax	int numread;
379303095Ssobomax	struct mkuz_blk *rval;
380303095Ssobomax	static int blockcnt;
381303095Ssobomax	off_t cpos;
382135045Ssobomax
383303095Ssobomax	rval = mkuz_blk_ctor(clstsize);
384303095Ssobomax
385303095Ssobomax	rval->info.blkno = blockcnt;
386303095Ssobomax	blockcnt += 1;
387303095Ssobomax	cpos = lseek(fd, 0, SEEK_CUR);
388303095Ssobomax	if (cpos < 0) {
389303095Ssobomax		err(1, "readblock: lseek() failed");
390303095Ssobomax		/* Not reached */
391303095Ssobomax	}
392303095Ssobomax	rval->info.offset = cpos;
393303095Ssobomax
394303095Ssobomax	numread = read(fd, rval->data, clstsize);
395135045Ssobomax	if (numread < 0) {
396303095Ssobomax		err(1, "readblock: read() failed");
397135045Ssobomax		/* Not reached */
398135045Ssobomax	}
399135045Ssobomax	if (numread == 0) {
400303095Ssobomax		free(rval);
401303095Ssobomax		return MKUZ_BLK_EOF;
402135045Ssobomax	}
403303095Ssobomax	rval->info.len = numread;
404303095Ssobomax	return rval;
405135045Ssobomax}
406135045Ssobomax
/*
 * Print the command synopsis to stderr and terminate with exit status 1.
 */
static void
usage(void)
{
	static const char synopsis[] =
	    "usage: mkuzip [-vZdLS] [-o outfile] [-s cluster_size] "
	    "[-j ncompr] infile\n";

	fputs(synopsis, stderr);
	exit(1);
}
415135045Ssobomax
/*
 * malloc() wrapper that never returns NULL: if the allocation fails the
 * process is terminated through err() with exit status 1.
 *
 * The returned memory is uninitialized and is released with free().
 */
void *
mkuz_safe_malloc(size_t size)
{
	void *mem;

	if ((mem = malloc(size)) == NULL)
		err(1, "can't allocate memory");
	return (mem);
}
428135045Ssobomax
/*
 * Allocate size bytes through mkuz_safe_malloc() and clear them to zero
 * before returning the buffer.
 */
void *
mkuz_safe_zmalloc(size_t size)
{
	void *mem = mkuz_safe_malloc(size);

	memset(mem, 0, size);
	return (mem);
}
438303095Ssobomax
439135045Ssobomaxstatic void
440135058Ssobomaxcleanup(void)
441135058Ssobomax{
442135045Ssobomax
443135045Ssobomax	if (cleanfile != NULL)
444135045Ssobomax		unlink(cleanfile);
445135045Ssobomax}
446303095Ssobomax
/*
 * Check whether every one of the `size' bytes starting at `memory' equals
 * `val'.
 *
 * Returns 1 if all bytes match and 0 otherwise.  An empty region
 * (size == 0) is reported as matching; the buffer is not touched in that
 * case.
 */
int
mkuz_memvcmp(const void *memory, unsigned char val, size_t size)
{
	const unsigned char *mm;

	/*
	 * Without this guard the first byte would be read from a zero-length
	 * buffer and `size - 1' below would wrap around to SIZE_MAX.
	 */
	if (size == 0)
		return (1);

	mm = (const unsigned char *)memory;
	/*
	 * Once the first byte is known to equal val, comparing the buffer
	 * with itself shifted by one byte verifies that each byte equals its
	 * successor, i.e. that all of them equal val.
	 */
	return (*mm == val) && memcmp(mm, mm + 1, size - 1) == 0;
}
455