1/*-
2 * Copyright (c) 2014 Juniper Networks, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28#include <sys/mman.h>
29#include <sys/stat.h>
30#include <assert.h>
31#include <err.h>
32#include <errno.h>
33#include <limits.h>
34#include <paths.h>
35#include <stdint.h>
36#include <stdio.h>
37#include <stdlib.h>
38#include <string.h>
39#include <unistd.h>
40
41#include "image.h"
42#include "mkimg.h"
43
/*
 * mmap(2) flags that are not available on every platform. A missing flag
 * is defined as 0 so that OR-ing it into the flag word has no effect.
 */
#ifndef MAP_NOCORE
#define	MAP_NOCORE	0
#endif
#ifndef MAP_NOSYNC
#define	MAP_NOSYNC	0
#endif

/*
 * lseek(2) whence values used to probe for holes and data. A missing
 * value is defined as -1; presumably lseek(2) then fails, which the
 * mapped copy-in path handles by treating the whole file as data.
 */
#ifndef SEEK_DATA
#define	SEEK_DATA	-1
#endif
#ifndef SEEK_HOLE
#define	SEEK_HOLE	-1
#endif
57
/*
 * A contiguous run of image sectors together with the place its contents
 * come from. ch_type selects which member of ch_u (if any) is meaningful.
 */
struct chunk {
	TAILQ_ENTRY(chunk) ch_list;	/* Linkage in the image_chunks list. */
	size_t	ch_size;		/* Size of chunk in bytes. */
	lba_t	ch_block;		/* Block address in image. */
	union {
		/* Used by CH_TYPE_FILE chunks. */
		struct {
			off_t	ofs;	/* Offset in backing file. */
			int	fd;	/* FD of backing file. */
		} file;
		/* Used by CH_TYPE_MEMORY chunks. */
		struct {
			void	*ptr;	/* Pointer to data in memory */
		} mem;
	} ch_u;
	u_int	ch_type;		/* One of the CH_TYPE_* values below. */
#define	CH_TYPE_ZEROES		0	/* Chunk is a gap (no data). */
#define	CH_TYPE_FILE		1	/* File-backed chunk. */
#define	CH_TYPE_MEMORY		2	/* Memory-backed chunk */
};
76
/* List of chunks describing the image, and the number of chunks inserted. */
static TAILQ_HEAD(chunk_head, chunk) image_chunks;
static u_int image_nchunks;

/*
 * Swap file state: path (mkstemp(3) template), descriptor (-1 while not
 * open), the page size obtained in image_init() and the current size of
 * the swap file in bytes.
 */
static char image_swap_file[PATH_MAX];
static int image_swap_fd = -1;
static u_int image_swap_pgsz;
static off_t image_swap_size;

/* Size of the image in blocks, as recorded by image_set_size(). */
static lba_t image_size;
86
87static int
88is_empty_sector(void *buf)
89{
90	uint64_t *p = buf;
91	size_t n, max;
92
93	assert(((uintptr_t)p & 3) == 0);
94
95	max = secsz / sizeof(uint64_t);
96	for (n = 0; n < max; n++) {
97		if (p[n] != 0UL)
98			return (0);
99	}
100	return (1);
101}
102
103/*
 * Swap file handling.
105 */
106
107static off_t
108image_swap_alloc(size_t size)
109{
110	off_t ofs;
111	size_t unit;
112
113	unit = (secsz > image_swap_pgsz) ? secsz : image_swap_pgsz;
114	assert((unit & (unit - 1)) == 0);
115
116	size = (size + unit - 1) & ~(unit - 1);
117
118	ofs = image_swap_size;
119	image_swap_size += size;
120	if (ftruncate(image_swap_fd, image_swap_size) == -1) {
121		image_swap_size = ofs;
122		ofs = -1LL;
123	}
124	return (ofs);
125}
126
127/*
128 * Image chunk handling.
129 */
130
131static struct chunk *
132image_chunk_find(lba_t blk)
133{
134	static struct chunk *last = NULL;
135	struct chunk *ch;
136
137	ch = (last != NULL && last->ch_block <= blk)
138	    ? last : TAILQ_FIRST(&image_chunks);
139	while (ch != NULL) {
140		if (ch->ch_block <= blk &&
141		    (lba_t)(ch->ch_block + (ch->ch_size / secsz)) > blk) {
142			last = ch;
143			break;
144		}
145		ch = TAILQ_NEXT(ch, ch_list);
146	}
147	return (ch);
148}
149
150static size_t
151image_chunk_grow(struct chunk *ch, size_t sz)
152{
153	size_t dsz, newsz;
154
155	newsz = ch->ch_size + sz;
156	if (newsz > ch->ch_size) {
157		ch->ch_size = newsz;
158		return (0);
159	}
160	/* We would overflow -- create new chunk for remainder. */
161	dsz = SIZE_MAX - ch->ch_size;
162	assert(dsz < sz);
163	ch->ch_size = SIZE_MAX;
164	return (sz - dsz);
165}
166
167static struct chunk *
168image_chunk_memory(struct chunk *ch, lba_t blk)
169{
170	struct chunk *new;
171	void *ptr;
172
173	ptr = calloc(1, secsz);
174	if (ptr == NULL)
175		return (NULL);
176
177	if (ch->ch_block < blk) {
178		new = malloc(sizeof(*new));
179		if (new == NULL) {
180			free(ptr);
181			return (NULL);
182		}
183		memcpy(new, ch, sizeof(*new));
184		ch->ch_size = (blk - ch->ch_block) * secsz;
185		new->ch_block = blk;
186		new->ch_size -= ch->ch_size;
187		TAILQ_INSERT_AFTER(&image_chunks, ch, new, ch_list);
188		image_nchunks++;
189		ch = new;
190	}
191
192	if (ch->ch_size > secsz) {
193		new = malloc(sizeof(*new));
194		if (new == NULL) {
195			free(ptr);
196			return (NULL);
197		}
198		memcpy(new, ch, sizeof(*new));
199		ch->ch_size = secsz;
200		new->ch_block++;
201		new->ch_size -= secsz;
202		TAILQ_INSERT_AFTER(&image_chunks, ch, new, ch_list);
203		image_nchunks++;
204	}
205
206	ch->ch_type = CH_TYPE_MEMORY;
207	ch->ch_u.mem.ptr = ptr;
208	return (ch);
209}
210
211static int
212image_chunk_skipto(lba_t to)
213{
214	struct chunk *ch;
215	lba_t from;
216	size_t sz;
217
218	ch = TAILQ_LAST(&image_chunks, chunk_head);
219	from = (ch != NULL) ? ch->ch_block + (ch->ch_size / secsz) : 0LL;
220
221	assert(from <= to);
222
223	/* Nothing to do? */
224	if (from == to)
225		return (0);
226	/* Avoid bugs due to overflows. */
227	if ((uintmax_t)(to - from) > (uintmax_t)(SIZE_MAX / secsz))
228		return (EFBIG);
229	sz = (to - from) * secsz;
230	if (ch != NULL && ch->ch_type == CH_TYPE_ZEROES) {
231		sz = image_chunk_grow(ch, sz);
232		if (sz == 0)
233			return (0);
234		from = ch->ch_block + (ch->ch_size / secsz);
235	}
236	ch = malloc(sizeof(*ch));
237	if (ch == NULL)
238		return (ENOMEM);
239	memset(ch, 0, sizeof(*ch));
240	ch->ch_block = from;
241	ch->ch_size = sz;
242	ch->ch_type = CH_TYPE_ZEROES;
243	TAILQ_INSERT_TAIL(&image_chunks, ch, ch_list);
244	image_nchunks++;
245	return (0);
246}
247
248static int
249image_chunk_append(lba_t blk, size_t sz, off_t ofs, int fd)
250{
251	struct chunk *ch;
252
253	ch = TAILQ_LAST(&image_chunks, chunk_head);
254	if (ch != NULL && ch->ch_type == CH_TYPE_FILE) {
255		if (fd == ch->ch_u.file.fd &&
256		    blk == (lba_t)(ch->ch_block + (ch->ch_size / secsz)) &&
257		    ofs == (off_t)(ch->ch_u.file.ofs + ch->ch_size)) {
258			sz = image_chunk_grow(ch, sz);
259			if (sz == 0)
260				return (0);
261			blk = ch->ch_block + (ch->ch_size / secsz);
262			ofs = ch->ch_u.file.ofs + ch->ch_size;
263		}
264	}
265	ch = malloc(sizeof(*ch));
266	if (ch == NULL)
267		return (ENOMEM);
268	memset(ch, 0, sizeof(*ch));
269	ch->ch_block = blk;
270	ch->ch_size = sz;
271	ch->ch_type = CH_TYPE_FILE;
272	ch->ch_u.file.ofs = ofs;
273	ch->ch_u.file.fd = fd;
274	TAILQ_INSERT_TAIL(&image_chunks, ch, ch_list);
275	image_nchunks++;
276	return (0);
277}
278
279static int
280image_chunk_copyin(lba_t blk, void *buf, size_t sz, off_t ofs, int fd)
281{
282	uint8_t *p = buf;
283	int error;
284
285	error = 0;
286	sz = (sz + secsz - 1) & ~(secsz - 1);
287	while (!error && sz > 0) {
288		if (is_empty_sector(p))
289			error = image_chunk_skipto(blk + 1);
290		else
291			error = image_chunk_append(blk, secsz, ofs, fd);
292		blk++;
293		p += secsz;
294		sz -= secsz;
295		ofs += secsz;
296	}
297	return (error);
298}
299
300/*
301 * File mapping support.
302 */
303
304static void *
305image_file_map(int fd, off_t ofs, size_t sz, off_t *iofp)
306{
307	void *ptr;
308	size_t unit;
309	int flags, prot;
310	off_t x;
311
312	/* On Linux anyway ofs must also be page aligned */
313	if ((x = (ofs % image_swap_pgsz)) != 0) {
314	    ofs -= x;
315	    sz += x;
316	    *iofp = x;
317	} else
318	    *iofp = 0;
319	unit = (secsz > image_swap_pgsz) ? secsz : image_swap_pgsz;
320	assert((unit & (unit - 1)) == 0);
321
322	flags = MAP_NOCORE | MAP_NOSYNC | MAP_SHARED;
323	/* Allow writing to our swap file only. */
324	prot = PROT_READ | ((fd == image_swap_fd) ? PROT_WRITE : 0);
325	sz = (sz + unit - 1) & ~(unit - 1);
326	ptr = mmap(NULL, sz, prot, flags, fd, ofs);
327	return ((ptr == MAP_FAILED) ? NULL : ptr);
328}
329
330static int
331image_file_unmap(void *buffer, size_t sz)
332{
333	size_t unit;
334
335	unit = (secsz > image_swap_pgsz) ? secsz : image_swap_pgsz;
336	sz = (sz + unit - 1) & ~(unit - 1);
337	if (madvise(buffer, sz, MADV_DONTNEED) != 0)
338		warn("madvise");
339	munmap(buffer, sz);
340	return (0);
341}
342
343/*
344 * Input/source file handling.
345 */
346
347static int
348image_copyin_stream(lba_t blk, int fd, uint64_t *sizep)
349{
350	char *buffer;
351	uint64_t bytesize;
352	off_t swofs;
353	size_t iosz;
354	ssize_t rdsz;
355	int error;
356	off_t iof;
357
358	/*
359	 * This makes sure we're doing I/O in multiples of the page
360	 * size as well as of the sector size. 2MB is the minimum
361	 * by virtue of secsz at least 512 bytes and the page size
362	 * at least 4K bytes.
363	 */
364	iosz = secsz * image_swap_pgsz;
365
366	bytesize = 0;
367	do {
368		swofs = image_swap_alloc(iosz);
369		if (swofs == -1LL)
370			return (errno);
371		buffer = image_file_map(image_swap_fd, swofs, iosz, &iof);
372		if (buffer == NULL)
373			return (errno);
374		rdsz = read(fd, &buffer[iof], iosz);
375		if (rdsz > 0)
376			error = image_chunk_copyin(blk, &buffer[iof], rdsz, swofs,
377			    image_swap_fd);
378		else if (rdsz < 0)
379			error = errno;
380		else
381			error = 0;
382		image_file_unmap(buffer, iosz);
383		/* XXX should we relinguish unused swap space? */
384		if (error)
385			return (error);
386
387		bytesize += rdsz;
388		blk += (rdsz + secsz - 1) / secsz;
389	} while (rdsz > 0);
390
391	if (sizep != NULL)
392		*sizep = bytesize;
393	return (0);
394}
395
396static int
397image_copyin_mapped(lba_t blk, int fd, uint64_t *sizep)
398{
399	off_t cur, data, end, hole, pos, iof;
400	void *mp;
401	char *buf;
402	uint64_t bytesize;
403	size_t iosz, sz;
404	int error;
405
406	/*
407	 * We'd like to know the size of the file and we must
408	 * be able to seek in order to mmap(2). If this isn't
409	 * possible, then treat the file as a stream/pipe.
410	 */
411	end = lseek(fd, 0L, SEEK_END);
412	if (end == -1L)
413		return (image_copyin_stream(blk, fd, sizep));
414
415	/*
416	 * We need the file opened for the duration and our
417	 * caller is going to close the file. Make a dup(2)
418	 * so that control the faith of the descriptor.
419	 */
420	fd = dup(fd);
421	if (fd == -1)
422		return (errno);
423
424	iosz = secsz * image_swap_pgsz;
425
426	bytesize = 0;
427	cur = pos = 0;
428	error = 0;
429	while (!error && cur < end) {
430		hole = lseek(fd, cur, SEEK_HOLE);
431		if (hole == -1)
432			hole = end;
433		data = lseek(fd, cur, SEEK_DATA);
434		if (data == -1)
435			data = end;
436
437		/*
438		 * Treat the entire file as data if sparse files
439		 * are not supported by the underlying file system.
440		 */
441		if (hole == end && data == end)
442			data = cur;
443
444		if (cur == hole && data > hole) {
445			hole = pos;
446			pos = data & ~((uint64_t)secsz - 1);
447
448			blk += (pos - hole) / secsz;
449			error = image_chunk_skipto(blk);
450
451			bytesize += pos - hole;
452			cur = data;
453		} else if (cur == data && hole > data) {
454			data = pos;
455			pos = (hole + secsz - 1) & ~((uint64_t)secsz - 1);
456
457			while (data < pos) {
458				sz = (pos - data > (off_t)iosz)
459				    ? iosz : (size_t)(pos - data);
460
461				buf = mp = image_file_map(fd, data, sz, &iof);
462				if (mp != NULL) {
463					buf += iof;
464					error = image_chunk_copyin(blk, buf,
465					    sz, data, fd);
466					image_file_unmap(mp, sz);
467				} else
468					error = errno;
469
470				blk += sz / secsz;
471				bytesize += sz;
472				data += sz;
473			}
474			cur = hole;
475		} else {
476			/*
477			 * I don't know what this means or whether it
478			 * can happen at all...
479			 */
480			assert(0);
481		}
482	}
483	if (error)
484		close(fd);
485	if (!error && sizep != NULL)
486		*sizep = bytesize;
487	return (error);
488}
489
490int
491image_copyin(lba_t blk, int fd, uint64_t *sizep)
492{
493	struct stat sb;
494	int error;
495
496	error = image_chunk_skipto(blk);
497	if (!error) {
498		if (fstat(fd, &sb) == -1 || !S_ISREG(sb.st_mode))
499			error = image_copyin_stream(blk, fd, sizep);
500		else
501			error = image_copyin_mapped(blk, fd, sizep);
502	}
503	return (error);
504}
505
506/*
507 * Output/sink file handling.
508 */
509
510int
511image_copyout(int fd)
512{
513	int error;
514
515	error = image_copyout_region(fd, 0, image_size);
516	if (!error)
517		error = image_copyout_done(fd);
518	return (error);
519}
520
/*
 * Finish writing to descriptor fd by setting the length of the file to
 * the current file position.
 *
 * A descriptor whose position cannot be queried is left alone and counts
 * as success. Returns 0 on success or the errno value of a failed
 * ftruncate(2).
 */
int
image_copyout_done(int fd)
{
	off_t here;

	here = lseek(fd, 0L, SEEK_CUR);
	if (here == -1)
		return (0);
	if (ftruncate(fd, here) == -1)
		return (errno);
	return (0);
}
533
/*
 * Write size bytes starting at ptr to descriptor fd.
 *
 * write(2) is allowed to transfer fewer bytes than requested, so the
 * call is repeated until everything has been written.
 *
 * Returns 0 on success, the errno value of a failed write(2), or EIO if
 * write(2) reports that nothing was written.
 */
static int
image_copyout_memory(int fd, size_t size, void *ptr)
{
	const char *p = ptr;
	ssize_t n;

	while (size > 0) {
		n = write(fd, p, size);
		if (n == -1)
			return (errno);
		/* Guard against spinning forever without progress. */
		if (n == 0)
			return (EIO);
		p += n;
		size -= (size_t)n;
	}
	return (0);
}
542
543int
544image_copyout_zeroes(int fd, size_t count)
545{
546	static uint8_t *zeroes = NULL;
547	size_t sz;
548	int error;
549
550	if (lseek(fd, (off_t)count, SEEK_CUR) != -1)
551		return (0);
552
553	/*
554	 * If we can't seek, we must write.
555	 */
556
557	if (zeroes == NULL) {
558		zeroes = calloc(1, secsz);
559		if (zeroes == NULL)
560			return (ENOMEM);
561	}
562
563	while (count > 0) {
564		sz = (count > secsz) ? secsz : count;
565		error = image_copyout_memory(fd, sz, zeroes);
566		if (error)
567			return (error);
568		count -= sz;
569	}
570	return (0);
571}
572
573static int
574image_copyout_file(int fd, size_t size, int ifd, off_t iofs)
575{
576	void *mp;
577	char *buf;
578	size_t iosz, sz;
579	int error;
580	off_t iof;
581
582	iosz = secsz * image_swap_pgsz;
583
584	while (size > 0) {
585		sz = (size > iosz) ? iosz : size;
586		buf = mp = image_file_map(ifd, iofs, sz, &iof);
587		if (buf == NULL)
588			return (errno);
589		buf += iof;
590		error = image_copyout_memory(fd, sz, buf);
591		image_file_unmap(mp, sz);
592		if (error)
593			return (error);
594		size -= sz;
595		iofs += sz;
596	}
597	return (0);
598}
599
600int
601image_copyout_region(int fd, lba_t blk, lba_t size)
602{
603	struct chunk *ch;
604	size_t ofs, sz;
605	int error;
606
607	size *= secsz;
608
609	error = 0;
610	while (!error && size > 0) {
611		ch = image_chunk_find(blk);
612		if (ch == NULL) {
613			error = EINVAL;
614			break;
615		}
616		ofs = (blk - ch->ch_block) * secsz;
617		sz = ch->ch_size - ofs;
618		sz = ((lba_t)sz < size) ? sz : (size_t)size;
619		switch (ch->ch_type) {
620		case CH_TYPE_ZEROES:
621			error = image_copyout_zeroes(fd, sz);
622			break;
623		case CH_TYPE_FILE:
624			error = image_copyout_file(fd, sz, ch->ch_u.file.fd,
625			    ch->ch_u.file.ofs + ofs);
626			break;
627		case CH_TYPE_MEMORY:
628			error = image_copyout_memory(fd, sz, ch->ch_u.mem.ptr);
629			break;
630		default:
631			assert(0);
632		}
633		size -= sz;
634		blk += sz / secsz;
635	}
636	return (error);
637}
638
639int
640image_data(lba_t blk, lba_t size)
641{
642	struct chunk *ch;
643	lba_t lim;
644
645	while (1) {
646		ch = image_chunk_find(blk);
647		if (ch == NULL)
648			return (0);
649		if (ch->ch_type != CH_TYPE_ZEROES)
650			return (1);
651		lim = ch->ch_block + (ch->ch_size / secsz);
652		if (lim >= blk + size)
653			return (0);
654		size -= lim - blk;
655		blk = lim;
656	}
657	/*NOTREACHED*/
658}
659
660lba_t
661image_get_size(void)
662{
663
664	return (image_size);
665}
666
667int
668image_set_size(lba_t blk)
669{
670	int error;
671
672	error = image_chunk_skipto(blk);
673	if (!error)
674		image_size = blk;
675	return (error);
676}
677
678int
679image_write(lba_t blk, void *buf, ssize_t len)
680{
681	struct chunk *ch;
682
683	while (len > 0) {
684		if (!is_empty_sector(buf)) {
685			ch = image_chunk_find(blk);
686			if (ch == NULL)
687				return (ENXIO);
688			/* We may not be able to write to files. */
689			if (ch->ch_type == CH_TYPE_FILE)
690				return (EINVAL);
691			if (ch->ch_type == CH_TYPE_ZEROES) {
692				ch = image_chunk_memory(ch, blk);
693				if (ch == NULL)
694					return (ENOMEM);
695			}
696			assert(ch->ch_type == CH_TYPE_MEMORY);
697			memcpy(ch->ch_u.mem.ptr, buf, secsz);
698		}
699		blk++;
700		buf = (char *)buf + secsz;
701		len--;
702	}
703	return (0);
704}
705
706static void
707image_cleanup(void)
708{
709	struct chunk *ch;
710
711	while ((ch = TAILQ_FIRST(&image_chunks)) != NULL) {
712		switch (ch->ch_type) {
713		case CH_TYPE_FILE:
714			/* We may be closing the same file multiple times. */
715			if (ch->ch_u.file.fd != -1)
716				close(ch->ch_u.file.fd);
717			break;
718		case CH_TYPE_MEMORY:
719			free(ch->ch_u.mem.ptr);
720			break;
721		default:
722			break;
723		}
724		TAILQ_REMOVE(&image_chunks, ch, ch_list);
725		free(ch);
726	}
727	if (image_swap_fd != -1)
728		close(image_swap_fd);
729	unlink(image_swap_file);
730}
731
732int
733image_init(void)
734{
735	const char *tmpdir;
736
737	TAILQ_INIT(&image_chunks);
738	image_nchunks = 0;
739
740	image_swap_size = 0;
741	image_swap_pgsz = getpagesize();
742
743	if (atexit(image_cleanup) == -1)
744		return (errno);
745	if ((tmpdir = getenv("TMPDIR")) == NULL || *tmpdir == '\0')
746		tmpdir = _PATH_TMP;
747	snprintf(image_swap_file, sizeof(image_swap_file), "%s/mkimg-XXXXXX",
748	    tmpdir);
749	image_swap_fd = mkstemp(image_swap_file);
750	if (image_swap_fd == -1)
751		return (errno);
752	return (0);
753}
754