unzip.c revision 180124
1/*-
2 * Copyright (c) 2007-2008 Dag-Erling Coïdan Smørgrav
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer
10 *    in this position and unchanged.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 *
27 * $FreeBSD: head/usr.bin/unzip/unzip.c 180124 2008-06-30 17:11:27Z des $
28 *
29 * This file would be much shorter if we didn't care about command-line
30 * compatibility with Info-ZIP's UnZip, which requires us to duplicate
31 * parts of libarchive in order to gain more detailed control of its
32 * behaviour for the purpose of implementing the -n, -o, -L and -a
33 * options.
34 */
35
36#include <sys/queue.h>
37#include <sys/stat.h>
38
39#include <ctype.h>
40#include <errno.h>
41#include <fcntl.h>
42#include <fnmatch.h>
43#include <stdarg.h>
44#include <stdio.h>
45#include <stdlib.h>
46#include <string.h>
47#include <unistd.h>
48
49#include <archive.h>
50#include <archive_entry.h>
51
52/* command-line options */
53static int		 a_opt;		/* convert EOL */
54static const char	*d_arg;		/* directory */
55static int		 j_opt;		/* junk directories */
56static int		 L_opt;		/* lowercase names */
57static int		 l_opt;		/* list */
58static int		 n_opt;		/* never overwrite */
59static int		 o_opt;		/* always overwrite */
60static int		 q_opt;		/* quiet */
61static int		 t_opt;		/* test */
62static int		 u_opt;		/* update */
63
64/* time when unzip started */
65static time_t		 now;
66
67/* debug flag */
68static int		 unzip_debug;
69
70/* running on tty? */
71static int		 tty;
72
73/* error flag for -t */
74static int		 test_failed;
75
76/* convenience macro */
77/* XXX should differentiate between ARCHIVE_{WARN,FAIL,RETRY} */
78#define ac(call)						\
79	do {							\
80		int acret = (call);				\
81		if (acret != ARCHIVE_OK)			\
82			errorx("%s", archive_error_string(a));	\
83	} while (0)
84
85/*
86 * Indicates that last info() did not end with EOL.  This helps error() et
87 * al. avoid printing an error message on the same line as an incomplete
88 * informational message.
89 */
90static int noeol;
91
92/* fatal error message + errno */
93static void
94error(const char *fmt, ...)
95{
96	va_list ap;
97
98	if (noeol)
99		fprintf(stdout, "\n");
100	fflush(stdout);
101	fprintf(stderr, "unzip: ");
102	va_start(ap, fmt);
103	vfprintf(stderr, fmt, ap);
104	va_end(ap);
105	fprintf(stderr, ": %s\n", strerror(errno));
106	exit(1);
107}
108
109/* fatal error message, no errno */
110static void
111errorx(const char *fmt, ...)
112{
113	va_list ap;
114
115	if (noeol)
116		fprintf(stdout, "\n");
117	fflush(stdout);
118	fprintf(stderr, "unzip: ");
119	va_start(ap, fmt);
120	vfprintf(stderr, fmt, ap);
121	va_end(ap);
122	fprintf(stderr, "\n");
123	exit(1);
124}
125
126#if 0
127/* non-fatal error message + errno */
128static void
129warning(const char *fmt, ...)
130{
131	va_list ap;
132
133	if (noeol)
134		fprintf(stdout, "\n");
135	fflush(stdout);
136	fprintf(stderr, "unzip: ");
137	va_start(ap, fmt);
138	vfprintf(stderr, fmt, ap);
139	va_end(ap);
140	fprintf(stderr, ": %s\n", strerror(errno));
141}
142#endif
143
144/* non-fatal error message, no errno */
145static void
146warningx(const char *fmt, ...)
147{
148	va_list ap;
149
150	if (noeol)
151		fprintf(stdout, "\n");
152	fflush(stdout);
153	fprintf(stderr, "unzip: ");
154	va_start(ap, fmt);
155	vfprintf(stderr, fmt, ap);
156	va_end(ap);
157	fprintf(stderr, "\n");
158}
159
160/* informational message (if not -q) */
161static void
162info(const char *fmt, ...)
163{
164	va_list ap;
165	int i;
166
167	if (q_opt && !unzip_debug)
168		return;
169	va_start(ap, fmt);
170	vfprintf(stdout, fmt, ap);
171	va_end(ap);
172	fflush(stdout);
173
174	for (i = 0; fmt[i] != '\0'; ++i)
175		/* nothing */ ;
176	noeol = !(i && fmt[i - 1] == '\n');
177}
178
179/* debug message (if unzip_debug) */
180static void
181debug(const char *fmt, ...)
182{
183	va_list ap;
184	int i;
185
186	if (!unzip_debug)
187		return;
188	va_start(ap, fmt);
189	vfprintf(stderr, fmt, ap);
190	va_end(ap);
191	fflush(stderr);
192
193	for (i = 0; fmt[i] != '\0'; ++i)
194		/* nothing */ ;
195	noeol = !(i && fmt[i - 1] == '\n');
196}
197
198/* duplicate a path name, possibly converting to lower case */
199static char *
200pathdup(const char *path)
201{
202	char *str;
203	int len;
204
205	len = strlen(path);
206	while (len && path[len - 1] == '/')
207		len--;
208	if ((str = malloc(len + 1)) == NULL) {
209		errno = ENOMEM;
210		error("malloc()");
211	}
212	for (int i = 0; i < len; ++i)
213		str[i] = L_opt ? tolower(path[i]) : path[i];
214	str[len] = '\0';
215
216	return (str);
217}
218
219/* concatenate two path names */
220static char *
221pathcat(const char *prefix, const char *path)
222{
223	char *str;
224	int prelen, len;
225
226	prelen = prefix ? strlen(prefix) + 1 : 0;
227	len = strlen(path) + 1;
228	if ((str = malloc(prelen + len)) == NULL) {
229		errno = ENOMEM;
230		error("malloc()");
231	}
232	if (prefix) {
233		memcpy(str, prefix, prelen);	/* includes zero */
234		str[prelen - 1] = '/';		/* splat zero */
235	}
236	memcpy(str + prelen, path, len);	/* includes zero */
237
238	return (str);
239}
240
241/*
242 * Pattern lists for include / exclude processing
243 */
244struct pattern {
245	STAILQ_ENTRY(pattern) link;
246	char pattern[];
247};
248
249STAILQ_HEAD(pattern_list, pattern);
250static struct pattern_list include = STAILQ_HEAD_INITIALIZER(include);
251static struct pattern_list exclude = STAILQ_HEAD_INITIALIZER(exclude);
252
253/*
254 * Add an entry to a pattern list
255 */
256static void
257add_pattern(struct pattern_list *list, const char *pattern)
258{
259	struct pattern *entry;
260	int len;
261
262	debug("adding pattern '%s'\n", pattern);
263	len = strlen(pattern);
264	if ((entry = malloc(sizeof *entry + len + 1)) == NULL) {
265		errno = ENOMEM;
266		error("malloc()");
267	}
268	memset(&entry->link, 0, sizeof entry->link);
269	memcpy(entry->pattern, pattern, len + 1);
270	STAILQ_INSERT_TAIL(list, entry, link);
271}
272
273/*
274 * Match a string against a list of patterns
275 */
276static int
277match_pattern(struct pattern_list *list, const char *str)
278{
279	struct pattern *entry;
280
281	STAILQ_FOREACH(entry, list, link) {
282		if (fnmatch(entry->pattern, str, 0) == 0)
283			return (1);
284	}
285	return (0);
286}
287
288/*
289 * Verify that a given pathname is in the include list and not in the
290 * exclude list.
291 */
292static int
293accept_pathname(const char *pathname)
294{
295
296	if (!STAILQ_EMPTY(&include) && !match_pattern(&include, pathname))
297		return (0);
298	if (!STAILQ_EMPTY(&exclude) && match_pattern(&exclude, pathname))
299		return (0);
300	return (1);
301}
302
303/*
304 * Create the specified directory with the specified mode, taking certain
305 * precautions on they way.
306 */
307static void
308make_dir(const char *path, int mode)
309{
310	struct stat sb;
311
312	if (lstat(path, &sb) == 0) {
313		if (S_ISDIR(sb.st_mode))
314			return;
315		/*
316		 * Normally, we should either ask the user about removing
317		 * the non-directory of the same name as a directory we
318		 * wish to create, or respect the -n or -o command-line
319		 * options.  However, this may lead to a later failure or
320		 * even compromise (if this non-directory happens to be a
321		 * symlink to somewhere unsafe), so we don't.
322		 */
323
324		/*
325		 * Don't check unlink() result; failure will cause mkdir()
326		 * to fail later, which we will catch.
327		 */
328		(void)unlink(path);
329	}
330	if (mkdir(path, mode) != 0 && errno != EEXIST)
331		error("mkdir('%s')", path);
332}
333
334/*
335 * Ensure that all directories leading up to (but not including) the
336 * specified path exist.
337 *
338 * XXX inefficient + modifies the file in-place
339 */
340static void
341make_parent(char *path)
342{
343	struct stat sb;
344	char *sep;
345
346	sep = strrchr(path, '/');
347	if (sep == NULL || sep == path)
348		return;
349	*sep = '\0';
350	if (lstat(path, &sb) == 0) {
351		if (S_ISDIR(sb.st_mode)) {
352			*sep = '/';
353			return;
354		}
355		unlink(path);
356	}
357	make_parent(path);
358	mkdir(path, 0755);
359	*sep = '/';
360
361#if 0
362	for (sep = path; (sep = strchr(sep, '/')) != NULL; sep++) {
363		/* root in case of absolute d_arg */
364		if (sep == path)
365			continue;
366		*sep = '\0';
367		make_dir(path, 0755);
368		*sep = '/';
369	}
370#endif
371}
372
373/*
374 * Extract a directory.
375 */
376static void
377extract_dir(struct archive *a, struct archive_entry *e, const char *path)
378{
379	int mode;
380
381	mode = archive_entry_filetype(e) & 0777;
382	if (mode == 0)
383		mode = 0755;
384
385	/*
386	 * Some zipfiles contain directories with weird permissions such
387	 * as 0644 or 0444.  This can cause strange issues such as being
388	 * unable to extract files into the directory we just created, or
389	 * the user being unable to remove the directory later without
390	 * first manually changing its permissions.  Therefore, we whack
391	 * the permissions into shape, assuming that the user wants full
392	 * access and that anyone who gets read access also gets execute
393	 * access.
394	 */
395	mode |= 0700;
396	if (mode & 0040)
397		mode |= 0010;
398	if (mode & 0004)
399		mode |= 0001;
400
401	info("d %s\n", path);
402	make_dir(path, mode);
403	ac(archive_read_data_skip(a));
404}
405
406static unsigned char buffer[8192];
407static char spinner[] = { '|', '/', '-', '\\' };
408
409/*
410 * Extract a regular file.
411 */
412static void
413extract_file(struct archive *a, struct archive_entry *e, const char *path)
414{
415	int mode;
416	time_t mtime;
417	struct stat sb;
418	struct timeval tv[2];
419	int cr, fd, text, warn;
420	ssize_t len;
421	unsigned char *p, *q, *end;
422
423	mode = archive_entry_filetype(e) & 0777;
424	if (mode == 0)
425		mode = 0644;
426	mtime = archive_entry_mtime(e);
427
428	/* look for existing file of same name */
429	if (lstat(path, &sb) == 0) {
430		if (u_opt) {
431			/* check if up-to-date */
432			if (S_ISREG(sb.st_mode) && sb.st_mtime > mtime)
433				return;
434			(void)unlink(path);
435		} else if (o_opt) {
436			/* overwrite */
437			(void)unlink(path);
438		} else if (n_opt) {
439			/* do not overwrite */
440			return;
441		} else {
442			/* XXX ask user */
443			errorx("not implemented");
444		}
445	}
446
447	if ((fd = open(path, O_RDWR|O_CREAT|O_TRUNC, mode)) < 0)
448		error("open('%s')", path);
449
450	/* loop over file contents and write to disk */
451	info("x %s", path);
452	text = a_opt;
453	warn = 0;
454	cr = 0;
455	for (int n = 0; ; n++) {
456		if (tty && (n % 4) == 0)
457			info(" %c\b\b", spinner[(n / 4) % sizeof spinner]);
458
459		len = archive_read_data(a, buffer, sizeof buffer);
460
461		if (len < 0)
462			ac(len);
463
464		/* left over CR from previous buffer */
465		if (a_opt && cr) {
466			if (len == 0 || buffer[0] != '\n')
467				if (write(fd, "\r", 1) != 1)
468					error("write('%s')", path);
469			cr = 0;
470		}
471
472		/* EOF */
473		if (len == 0)
474			break;
475		end = buffer + len;
476
477		/*
478		 * Detect whether this is a text file.  The correct way to
479		 * do this is to check the least significant bit of the
480		 * "internal file attributes" field of the corresponding
481		 * file header in the central directory, but libarchive
482		 * does not read the central directory, so we have to
483		 * guess by looking for non-ASCII characters in the
484		 * buffer.  Hopefully we won't guess wrong.  If we do
485		 * guess wrong, we print a warning message later.
486		 */
487		if (a_opt && n == 0) {
488			for (p = buffer; p < end; ++p) {
489				if (!isascii((unsigned char)*p)) {
490					text = 0;
491					break;
492				}
493			}
494		}
495
496		/* simple case */
497		if (!a_opt || !text) {
498			if (write(fd, buffer, len) != len)
499				error("write('%s')", path);
500			continue;
501		}
502
503		/* hard case: convert \r\n to \n (sigh...) */
504		for (p = buffer; p < end; p = q + 1) {
505			for (q = p; q < end; q++) {
506				if (!warn && !isascii(*q)) {
507					warningx("%s may be corrupted due"
508					    " to weak text file detection"
509					    " heuristic", path);
510					warn = 1;
511				}
512				if (q[0] != '\r')
513					continue;
514				if (&q[1] == end) {
515					cr = 1;
516					break;
517				}
518				if (q[1] == '\n')
519					break;
520			}
521			if (write(fd, p, q - p) != q - p)
522				error("write('%s')", path);
523		}
524	}
525	if (tty)
526		info("  \b\b");
527	if (text)
528		info(" (text)");
529	info("\n");
530
531	/* set access and modification time */
532	tv[0].tv_sec = now;
533	tv[0].tv_usec = 0;
534	tv[1].tv_sec = mtime;
535	tv[1].tv_usec = 0;
536	if (futimes(fd, tv) != 0)
537		error("utimes('%s')", path);
538	if (close(fd) != 0)
539		error("close('%s')", path);
540}
541
542/*
543 * Extract a zipfile entry: first perform some sanity checks to ensure
544 * that it is either a directory or a regular file and that the path is
545 * not absolute and does not try to break out of the current directory;
546 * then call either extract_dir() or extract_file() as appropriate.
547 *
548 * This is complicated a bit by the various ways in which we need to
549 * manipulate the path name.  Case conversion (if requested by the -L
550 * option) happens first, but the include / exclude patterns are applied
551 * to the full converted path name, before the directory part of the path
552 * is removed in accordance with the -j option.  Sanity checks are
553 * intentionally done earlier than they need to be, so the user will get a
554 * warning about insecure paths even for files or directories which
555 * wouldn't be extracted anyway.
556 */
557static void
558extract(struct archive *a, struct archive_entry *e)
559{
560	char *pathname, *realpathname;
561	mode_t filetype;
562	char *p, *q;
563
564	pathname = pathdup(archive_entry_pathname(e));
565	filetype = archive_entry_filetype(e);
566
567	/* sanity checks */
568	if (pathname[0] == '/' ||
569	    strncmp(pathname, "../", 3) == 0 ||
570	    strstr(pathname, "/../") != NULL) {
571		warningx("skipping insecure entry '%s'", pathname);
572		ac(archive_read_data_skip(a));
573		free(pathname);
574		return;
575	}
576
577	/* I don't think this can happen in a zipfile.. */
578	if (!S_ISDIR(filetype) && !S_ISREG(filetype)) {
579		warningx("skipping non-regular entry '%s'", pathname);
580		ac(archive_read_data_skip(a));
581		free(pathname);
582		return;
583	}
584
585	/* skip directories in -j case */
586	if (S_ISDIR(filetype) && j_opt) {
587		ac(archive_read_data_skip(a));
588		free(pathname);
589		return;
590	}
591
592	/* apply include / exclude patterns */
593	if (!accept_pathname(pathname)) {
594		ac(archive_read_data_skip(a));
595		free(pathname);
596		return;
597	}
598
599	/* apply -j and -d */
600	if (j_opt) {
601		for (p = q = pathname; *p; ++p)
602			if (*p == '/')
603				q = p + 1;
604		realpathname = pathcat(d_arg, q);
605	} else {
606		realpathname = pathcat(d_arg, pathname);
607	}
608
609	/* ensure that parent directory exists */
610	make_parent(realpathname);
611
612	if (S_ISDIR(filetype))
613		extract_dir(a, e, realpathname);
614	else
615		extract_file(a, e, realpathname);
616
617	free(realpathname);
618	free(pathname);
619}
620
621/*
622 * Print the name of an entry to stdout.
623 */
624static void
625list(struct archive *a, struct archive_entry *e)
626{
627
628	printf("%s\n", archive_entry_pathname(e));
629	ac(archive_read_data_skip(a));
630}
631
632/*
633 * Extract to memory to check CRC
634 */
635static void
636test(struct archive *a, struct archive_entry *e)
637{
638	ssize_t len;
639
640	if (S_ISDIR(archive_entry_filetype(e)))
641		return;
642
643	info("%s ", archive_entry_pathname(e));
644	while ((len = archive_read_data(a, buffer, sizeof buffer)) > 0)
645		/* nothing */;
646	if (len < 0) {
647		info("%s\n", archive_error_string(a));
648		++test_failed;
649	} else {
650		info("OK\n");
651	}
652
653	/* shouldn't be necessary, but it doesn't hurt */
654	ac(archive_read_data_skip(a));
655}
656
657
658/*
659 * Main loop: open the zipfile, iterate over its contents and decide what
660 * to do with each entry.
661 */
662static void
663unzip(const char *fn)
664{
665	struct archive *a;
666	struct archive_entry *e;
667	int fd, ret;
668
669	if ((fd = open(fn, O_RDONLY)) < 0)
670		error("%s", fn);
671
672	a = archive_read_new();
673	ac(archive_read_support_format_zip(a));
674	ac(archive_read_open_fd(a, fd, 8192));
675
676	for (;;) {
677		ret = archive_read_next_header(a, &e);
678		if (ret == ARCHIVE_EOF)
679			break;
680		ac(ret);
681		if (t_opt)
682			test(a, e);
683		else if (l_opt)
684			list(a, e);
685		else
686			extract(a, e);
687	}
688
689	ac(archive_read_close(a));
690	(void)archive_read_finish(a);
691	if (close(fd) != 0)
692		error("%s", fn);
693
694	if (t_opt && test_failed)
695		errorx("%d checksum error(s) found.", test_failed);
696}
697
698static void
699usage(void)
700{
701
702	fprintf(stderr, "usage: unzip [-ajLlnoqtu] [-d dir] zipfile\n");
703	exit(1);
704}
705
706static int
707getopts(int argc, char *argv[])
708{
709	int opt;
710
711	optreset = optind = 1;
712	while ((opt = getopt(argc, argv, "ad:jLlnoqtux:")) != -1)
713		switch (opt) {
714		case 'a':
715			a_opt = 1;
716			break;
717		case 'd':
718			d_arg = optarg;
719			break;
720		case 'j':
721			j_opt = 1;
722			break;
723		case 'L':
724			L_opt = 1;
725			break;
726		case 'l':
727			l_opt = 1;
728			break;
729		case 'n':
730			n_opt = 1;
731			break;
732		case 'o':
733			o_opt = 1;
734			break;
735		case 'q':
736			q_opt = 1;
737			break;
738		case 't':
739			t_opt = 1;
740			break;
741		case 'u':
742			u_opt = 1;
743			break;
744		case 'x':
745			add_pattern(&exclude, optarg);
746			break;
747		default:
748			usage();
749		}
750
751	return (optind);
752}
753
754int
755main(int argc, char *argv[])
756{
757	const char *zipfile;
758	int nopts;
759
760	if (isatty(STDOUT_FILENO))
761		tty = 1;
762
763	if (getenv("UNZIP_DEBUG") != NULL)
764		unzip_debug = 1;
765	for (int i = 0; i < argc; ++i)
766		debug("%s%c", argv[i], (i < argc - 1) ? ' ' : '\n');
767
768	/*
769	 * Info-ZIP's unzip(1) expects certain options to come before the
770	 * zipfile name, and others to come after - though it does not
771	 * enforce this.  For simplicity, we accept *all* options both
772	 * before and after the zipfile name.
773	 */
774	nopts = getopts(argc, argv);
775
776	if (argc <= nopts)
777		usage();
778	zipfile = argv[nopts++];
779
780	while (nopts < argc && *argv[nopts] != '-')
781		add_pattern(&include, argv[nopts++]);
782
783	nopts--; /* fake argv[0] */
784	nopts += getopts(argc - nopts, argv + nopts);
785
786	if (n_opt + o_opt + u_opt > 1)
787		errorx("-n, -o and -u are contradictory");
788
789	time(&now);
790
791	unzip(zipfile);
792
793	exit(0);
794}
795