vioqcow2.c revision 1.13
1/*	$OpenBSD: vioqcow2.c,v 1.13 2019/01/10 19:21:02 deraadt Exp $	*/
2
3/*
4 * Copyright (c) 2018 Ori Bernstein <ori@eigenstate.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18
19#include <sys/types.h>
20#include <sys/stat.h>
21
22#include <machine/vmmvar.h>
23#include <dev/pci/pcireg.h>
24
25#include <stdlib.h>
26#include <string.h>
27#include <unistd.h>
28#include <fcntl.h>
29#include <assert.h>
30#include <libgen.h>
31#include <err.h>
32#include <errno.h>
33
34#include "vmd.h"
35#include "vmm.h"
36#include "virtio.h"
37
/*
 * Flag bits carried in the top of L1/L2 table entries.  xlate() refuses
 * any entry with QCOW2_COMPRESSED set; QCOW2_INPLACE marks an entry whose
 * target may be written directly instead of going through mkcluster().
 */
#define QCOW2_COMPRESSED	(1ull << 62)
#define QCOW2_INPLACE		(1ull << 63)

/* Incompatible-feature header bits that qc2_open() tolerates. */
#define QCOW2_DIRTY		0x1
#define QCOW2_CORRUPT		0x2
43
/*
 * ICFEATURE_* bit values; they carry the same values as QCOW2_DIRTY and
 * QCOW2_CORRUPT.
 * NOTE(review): not referenced in the portion of the file reviewed.
 */
enum {
	ICFEATURE_DIRTY		= 0x1,
	ICFEATURE_CORRUPT	= 0x2,
};
48
/*
 * ACFEATURE_* bit values.
 * NOTE(review): presumably an autoclear-feature bit; not referenced in
 * the portion of the file reviewed -- confirm before relying on it.
 */
enum {
	ACFEATURE_BITEXT	= 0x1,
};
52
/*
 * On-disk qcow2 header as read from offset 0 of an image.  Every
 * multi-byte field is stored big-endian and is converted with
 * be32toh()/be64toh() before use.
 */
struct qcheader {
	char     magic[4];		/* compared against VM_MAGIC_QCOW */
	uint32_t version;		/* qc2_open() accepts 2 and 3 */
	uint64_t backingoff;		/* file offset of the base image path */
	uint32_t backingsz;		/* length of that path; 0 = no base */
	uint32_t clustershift;		/* cluster size is 1 << clustershift */
	uint64_t disksz;		/* virtual disk size */
	uint32_t cryptmethod;		/* virtio_qcow2_create() writes 0 */
	uint32_t l1sz;			/* number of 8-byte L1 entries */
	uint64_t l1off;			/* file offset of the L1 table */
	uint64_t refoff;		/* file offset of the refcount table */
	uint32_t refsz;			/* refcount table size, in clusters */
	uint32_t snapcount;		/* loaded into qcdisk.nsnap */
	uint64_t snapsz;		/* loaded into qcdisk.snapoff */

	/* v3 additions */
	uint64_t incompatfeatures;	/* only DIRTY/CORRUPT are tolerated */
	uint64_t compatfeatures;
	uint64_t autoclearfeatures;
	uint32_t reforder;		/* Bits = 1 << reforder; must be 4 */
	uint32_t headersz;
} __packed;
74
75struct qcdisk {
76	pthread_rwlock_t lock;
77	struct qcdisk *base;
78	struct qcheader header;
79
80	int       fd;
81	uint64_t *l1;
82	off_t     end;
83	off_t	  clustersz;
84	off_t	  disksz; /* In bytes */
85	uint32_t  cryptmethod;
86
87	uint32_t l1sz;
88	off_t	 l1off;
89
90	off_t	 refoff;
91	off_t	 refsz;
92
93	uint32_t nsnap;
94	off_t	 snapoff;
95
96	/* v3 features */
97	uint64_t incompatfeatures;
98	uint64_t autoclearfeatures;
99	uint32_t refssz;
100	uint32_t headersz;
101};
102
/* Declared only; no definition appears in the source reviewed. */
extern char *__progname;

/* Backing-store callbacks installed by virtio_qcow2_init(). */
static ssize_t qc2_pread(void *, char *, size_t, off_t);
static ssize_t qc2_pwrite(void *, char *, size_t, off_t);
static void qc2_close(void *, int);

/* Image setup, offset translation and cluster allocation helpers. */
static int qc2_open(struct qcdisk *, int *, size_t);
static off_t xlate(struct qcdisk *, off_t, int *);
static off_t mkcluster(struct qcdisk *, struct qcdisk *, off_t, off_t);
static void copy_cluster(struct qcdisk *, struct qcdisk *, off_t, off_t);
static void inc_refs(struct qcdisk *, off_t, int);
113
114/*
115 * Initializes a raw disk image backing file from an fd.
116 * Stores the number of 512 byte sectors in *szp,
117 * returning -1 for error, 0 for success.
118 *
119 * May open snapshot base images.
120 */
121int
122virtio_qcow2_init(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd)
123{
124	struct qcdisk *diskp;
125
126	diskp = malloc(sizeof(struct qcdisk));
127	if (diskp == NULL)
128		return -1;
129	if (qc2_open(diskp, fd, nfd) == -1) {
130		log_warnx("could not open qcow2 disk");
131		return -1;
132	}
133	file->p = diskp;
134	file->pread = qc2_pread;
135	file->pwrite = qc2_pwrite;
136	file->close = qc2_close;
137	*szp = diskp->disksz;
138	return 0;
139}
140
141/*
142 * Return the path to the base image given a disk image.
143 * Called from vmctl.
144 */
145ssize_t
146virtio_qcow2_get_base(int fd, char *path, size_t npath, const char *dpath)
147{
148	char expanded[PATH_MAX];
149	struct qcheader header;
150	uint64_t backingoff;
151	uint32_t backingsz;
152	char *s = NULL;
153
154	if (pread(fd, &header, sizeof(header), 0) != sizeof(header)) {
155		log_warnx("short read on header");
156		return -1;
157	}
158	if (strncmp(header.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0) {
159		log_warnx("invalid magic numbers");
160		return -1;
161	}
162	backingoff = be64toh(header.backingoff);
163	backingsz = be32toh(header.backingsz);
164	if (backingsz == 0)
165		return 0;
166
167	if (backingsz >= npath - 1) {
168		log_warnx("snapshot path too long");
169		return -1;
170	}
171	if (pread(fd, path, backingsz, backingoff) != backingsz) {
172		log_warnx("could not read snapshot base name");
173		return -1;
174	}
175	path[backingsz] = '\0';
176
177	/*
178	 * Relative paths should be interpreted relative to the disk image,
179	 * rather than relative to the directory vmd happens to be running in,
180	 * since this is the only userful interpretation.
181	 */
182	if (path[0] == '/') {
183		if (realpath(path, expanded) == NULL ||
184		    strlcpy(path, expanded, npath) >= npath) {
185			log_warnx("unable to resolve %s", path);
186			return -1;
187		}
188	} else {
189		s = dirname(dpath);
190		if (snprintf(expanded, sizeof(expanded),
191		    "%s/%s", s, path) >= (int)sizeof(expanded)) {
192			log_warnx("path too long: %s/%s", s, path);
193			return -1;
194		}
195		if (npath < PATH_MAX ||
196		    realpath(expanded, path) == NULL) {
197			log_warnx("unable to resolve %s", path);
198			return -1;
199		}
200	}
201
202	return strlen(path);
203}
204
205static int
206qc2_open(struct qcdisk *disk, int *fds, size_t nfd)
207{
208	char basepath[PATH_MAX];
209	struct stat st;
210	struct qcheader header;
211	uint64_t backingoff;
212	uint32_t backingsz;
213	off_t i;
214	int version, fd;
215
216	pthread_rwlock_init(&disk->lock, NULL);
217	fd = fds[0];
218	disk->fd = fd;
219	disk->base = NULL;
220	disk->l1 = NULL;
221
222	if (pread(fd, &header, sizeof(header), 0) != sizeof(header))
223		fatalx("short read on header");
224	if (strncmp(header.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0)
225		fatalx("invalid magic numbers");
226
227	disk->clustersz		= (1ull << be32toh(header.clustershift));
228	disk->disksz		= be64toh(header.disksz);
229	disk->cryptmethod	= be32toh(header.cryptmethod);
230	disk->l1sz		= be32toh(header.l1sz);
231	disk->l1off		= be64toh(header.l1off);
232	disk->refsz		= be32toh(header.refsz);
233	disk->refoff		= be64toh(header.refoff);
234	disk->nsnap		= be32toh(header.snapcount);
235	disk->snapoff		= be64toh(header.snapsz);
236
237	/*
238	 * The additional features here are defined as 0 in the v2 format,
239	 * so as long as we clear the buffer before parsing, we don't need
240	 * to check versions here.
241	 */
242	disk->incompatfeatures = be64toh(header.incompatfeatures);
243	disk->autoclearfeatures = be64toh(header.autoclearfeatures);
244	disk->refssz = be32toh(header.refsz);
245	disk->headersz = be32toh(header.headersz);
246
247	/*
248	 * We only know about the dirty or corrupt bits here.
249	 */
250	if (disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT))
251		fatalx("unsupported features %llx",
252		    disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT));
253	if (be32toh(header.reforder) != 4)
254		fatalx("unsupported refcount size\n");
255
256	disk->l1 = calloc(disk->l1sz, sizeof(*disk->l1));
257	if (!disk->l1)
258		fatal("%s: could not allocate l1 table", __func__);
259	if (pread(disk->fd, disk->l1, 8 * disk->l1sz, disk->l1off)
260	    != 8 * disk->l1sz)
261		fatalx("%s: unable to read qcow2 L1 table", __func__);
262	for (i = 0; i < disk->l1sz; i++)
263		disk->l1[i] = be64toh(disk->l1[i]);
264	version = be32toh(header.version);
265	if (version != 2 && version != 3)
266		fatalx("%s: unknown qcow2 version %d", __func__, version);
267
268	backingoff = be64toh(header.backingoff);
269	backingsz = be32toh(header.backingsz);
270	if (backingsz != 0) {
271		if (backingsz >= sizeof(basepath) - 1) {
272			fatalx("%s: snapshot path too long", __func__);
273		}
274		if (pread(fd, basepath, backingsz, backingoff) != backingsz) {
275			fatalx("%s: could not read snapshot base name",
276			    __func__);
277		}
278		basepath[backingsz] = 0;
279		if (nfd <= 1) {
280			fatalx("%s: missing base image %s", __func__,
281			    basepath);
282		}
283
284
285		disk->base = calloc(1, sizeof(struct qcdisk));
286		if (!disk->base)
287			fatal("%s: could not open %s", __func__, basepath);
288		if (qc2_open(disk->base, fds + 1, nfd - 1) == -1)
289			fatalx("%s: could not open %s", __func__, basepath);
290		if (disk->base->clustersz != disk->clustersz)
291			fatalx("%s: all disk parts must share clustersize",
292			    __func__);
293	}
294	if (fstat(fd, &st) == -1)
295		fatal("%s: unable to stat disk", __func__);
296
297	disk->end = st.st_size;
298
299	log_debug("%s: qcow2 disk version %d size %lld end %lld snap %d",
300	    __func__, version, disk->disksz, disk->end, disk->nsnap);
301
302	return 0;
303}
304
305static ssize_t
306qc2_pread(void *p, char *buf, size_t len, off_t off)
307{
308	struct qcdisk *disk, *d;
309	off_t phys_off, end, cluster_off;
310	ssize_t sz, rem;
311
312	disk = p;
313	end = off + len;
314	if (off < 0 || end > disk->disksz)
315		return -1;
316
317	/* handle head chunk separately */
318	rem = len;
319	while (off != end) {
320		for (d = disk; d; d = d->base)
321			if ((phys_off = xlate(d, off, NULL)) > 0)
322				break;
323		/* Break out into chunks. This handles
324		 * three cases:
325		 *
326		 *    |----+====|========|====+-----|
327		 *
328		 * Either we are at the start of the read,
329		 * and the cluster has some leading bytes.
330		 * This means that we are reading the tail
331		 * of the cluster, and our size is:
332		 *
333		 * 	clustersz - (off % clustersz).
334		 *
335		 * Otherwise, we're reading the middle section.
336		 * We're already aligned here, so we can just
337		 * read the whole cluster size. Or we're at the
338		 * tail, at which point we just want to read the
339		 * remaining bytes.
340		 */
341		cluster_off = off % disk->clustersz;
342		sz = disk->clustersz - cluster_off;
343		if (sz > rem)
344			sz = rem;
345		/*
346		 * If we're within the disk, but don't have backing bytes,
347		 * just read back zeros.
348		 */
349		if (!d)
350			bzero(buf, sz);
351		else if (pread(d->fd, buf, sz, phys_off) != sz)
352			return -1;
353		off += sz;
354		buf += sz;
355		rem -= sz;
356	}
357	return len;
358}
359
360ssize_t
361qc2_pwrite(void *p, char *buf, size_t len, off_t off)
362{
363	struct qcdisk *disk, *d;
364	off_t phys_off, cluster_off, end;
365	ssize_t sz, rem;
366	int inplace;
367
368	d = p;
369	disk = p;
370	inplace = 1;
371	end = off + len;
372	if (off < 0 || end > disk->disksz)
373		return -1;
374	rem = len;
375	while (off != end) {
376		/* See the read code for a summary of the computation */
377		cluster_off = off % disk->clustersz;
378		sz = disk->clustersz - cluster_off;
379		if (sz > rem)
380			sz = rem;
381
382		phys_off = xlate(disk, off, &inplace);
383		if (phys_off == -1)
384			return -1;
385		/*
386		 * If we couldn't find the cluster in the writable disk,
387		 * see if it exists in the base image. If it does, we
388		 * need to copy it before the write. The copy happens
389		 * in the '!inplace' if clause below te search.
390		 */
391		if (phys_off == 0)
392			for (d = disk->base; d; d = d->base)
393				if ((phys_off = xlate(d, off, NULL)) > 0)
394					break;
395		if (!inplace || phys_off == 0)
396			phys_off = mkcluster(disk, d, off, phys_off);
397		if (phys_off == -1)
398			return -1;
399		if (phys_off < disk->clustersz)
400			fatalx("%s: writing reserved cluster", __func__);
401		if (pwrite(disk->fd, buf, sz, phys_off) != sz)
402			return -1;
403		off += sz;
404		buf += sz;
405		rem -= sz;
406	}
407	return len;
408}
409
410static void
411qc2_close(void *p, int stayopen)
412{
413	struct qcdisk *disk;
414
415	disk = p;
416	if (disk->base)
417		qc2_close(disk->base, stayopen);
418	if (!stayopen)
419		close(disk->fd);
420	free(disk->l1);
421	free(disk);
422}
423
424/*
425 * Translates a virtual offset into an on-disk offset.
426 * Returns:
427 * 	-1 on error
428 * 	 0 on 'not found'
429 * 	>0 on found
430 */
431static off_t
432xlate(struct qcdisk *disk, off_t off, int *inplace)
433{
434	off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff;
435	uint64_t buf;
436
437
438	/*
439	 * Clear out inplace flag -- xlate misses should not
440	 * be flagged as updatable in place. We will still
441	 * return 0 from them, but this leaves less surprises
442	 * in the API.
443	 */
444	if (inplace)
445		*inplace = 0;
446	pthread_rwlock_rdlock(&disk->lock);
447	if (off < 0)
448		goto err;
449
450	l2sz = disk->clustersz / 8;
451	l1off = (off / disk->clustersz) / l2sz;
452	if (l1off >= disk->l1sz)
453		goto err;
454
455	l2tab = disk->l1[l1off];
456	l2tab &= ~QCOW2_INPLACE;
457	if (l2tab == 0) {
458		pthread_rwlock_unlock(&disk->lock);
459		return 0;
460	}
461	l2off = (off / disk->clustersz) % l2sz;
462	pread(disk->fd, &buf, sizeof(buf), l2tab + l2off * 8);
463	cluster = be64toh(buf);
464	/*
465	 * cluster may be 0, but all future operations don't affect
466	 * the return value.
467	 */
468	if (inplace)
469		*inplace = !!(cluster & QCOW2_INPLACE);
470	if (cluster & QCOW2_COMPRESSED)
471		fatalx("%s: compressed clusters unsupported", __func__);
472	pthread_rwlock_unlock(&disk->lock);
473	clusteroff = 0;
474	cluster &= ~QCOW2_INPLACE;
475	if (cluster)
476		clusteroff = off % disk->clustersz;
477	return cluster + clusteroff;
478err:
479	pthread_rwlock_unlock(&disk->lock);
480	return -1;
481}
482
483/*
484 * Allocates a new cluster on disk, creating a new L2 table
485 * if needed. The cluster starts off with a refs of one,
486 * and the writable bit set.
487 *
488 * Returns -1 on error, and the physical address within the
489 * cluster of the write offset if it exists.
490 */
491static off_t
492mkcluster(struct qcdisk *disk, struct qcdisk *base, off_t off, off_t src_phys)
493{
494	off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff, orig;
495	uint64_t buf;
496	int fd;
497
498	pthread_rwlock_wrlock(&disk->lock);
499
500	cluster = -1;
501	fd = disk->fd;
502	/* L1 entries always exist */
503	l2sz = disk->clustersz / 8;
504	l1off = off / (disk->clustersz * l2sz);
505	if (l1off >= disk->l1sz)
506		fatalx("l1 offset outside disk");
507
508	disk->end = (disk->end + disk->clustersz - 1) & ~(disk->clustersz - 1);
509
510	l2tab = disk->l1[l1off];
511	l2off = (off / disk->clustersz) % l2sz;
512	/* We may need to create or clone an L2 entry to map the block */
513	if (l2tab == 0 || (l2tab & QCOW2_INPLACE) == 0) {
514		orig = l2tab & ~QCOW2_INPLACE;
515		l2tab = disk->end;
516		disk->end += disk->clustersz;
517		if (ftruncate(disk->fd, disk->end) == -1)
518			fatal("%s: ftruncate failed", __func__);
519
520		/*
521		 * If we translated, found a L2 entry, but it needed to
522		 * be copied, copy it.
523		 */
524		if (orig != 0)
525			copy_cluster(disk, disk, l2tab, orig);
526		/* Update l1 -- we flush it later */
527		disk->l1[l1off] = l2tab | QCOW2_INPLACE;
528		inc_refs(disk, l2tab, 1);
529	}
530	l2tab &= ~QCOW2_INPLACE;
531
532	/* Grow the disk */
533	if (ftruncate(disk->fd, disk->end + disk->clustersz) < 0)
534		fatalx("%s: could not grow disk", __func__);
535	if (src_phys > 0)
536		copy_cluster(disk, base, disk->end, src_phys);
537	cluster = disk->end;
538	disk->end += disk->clustersz;
539	buf = htobe64(cluster | QCOW2_INPLACE);
540	if (pwrite(disk->fd, &buf, sizeof(buf), l2tab + l2off * 8) != 8)
541		fatalx("%s: could not write cluster", __func__);
542
543	/* TODO: lazily sync: currently VMD doesn't close things */
544	buf = htobe64(disk->l1[l1off]);
545	if (pwrite(disk->fd, &buf, sizeof(buf), disk->l1off + 8 * l1off) != 8)
546		fatalx("%s: could not write l1", __func__);
547	inc_refs(disk, cluster, 1);
548
549	pthread_rwlock_unlock(&disk->lock);
550	clusteroff = off % disk->clustersz;
551	if (cluster + clusteroff < disk->clustersz)
552		fatalx("write would clobber header");
553	return cluster + clusteroff;
554}
555
556/* Copies a cluster containing src to dst. Src and dst need not be aligned. */
557static void
558copy_cluster(struct qcdisk *disk, struct qcdisk *base, off_t dst, off_t src)
559{
560	char *scratch;
561
562	scratch = malloc(disk->clustersz);
563	if (!scratch)
564		fatal("out of memory");
565	src &= ~(disk->clustersz - 1);
566	dst &= ~(disk->clustersz - 1);
567	if (pread(base->fd, scratch, disk->clustersz, src) == -1)
568		fatal("%s: could not read cluster", __func__);
569	if (pwrite(disk->fd, scratch, disk->clustersz, dst) == -1)
570		fatal("%s: could not write cluster", __func__);
571	free(scratch);
572}
573
574static void
575inc_refs(struct qcdisk *disk, off_t off, int newcluster)
576{
577	off_t l1off, l1idx, l2idx, l2cluster;
578	size_t nper;
579	uint16_t refs;
580	uint64_t buf;
581
582	off &= ~QCOW2_INPLACE;
583	nper = disk->clustersz / 2;
584	l1idx = (off / disk->clustersz) / nper;
585	l2idx = (off / disk->clustersz) % nper;
586	l1off = disk->refoff + 8 * l1idx;
587	if (pread(disk->fd, &buf, sizeof(buf), l1off) != 8)
588		fatal("could not read refs");
589
590	l2cluster = be64toh(buf);
591	if (l2cluster == 0) {
592		l2cluster = disk->end;
593		disk->end += disk->clustersz;
594		if (ftruncate(disk->fd, disk->end) < 0)
595			fatal("%s: failed to allocate ref block", __func__);
596		buf = htobe64(l2cluster);
597		if (pwrite(disk->fd, &buf, sizeof(buf), l1off) != 8)
598			fatal("%s: failed to write ref block", __func__);
599	}
600
601	refs = 1;
602	if (!newcluster) {
603		if (pread(disk->fd, &refs, sizeof(refs),
604		    l2cluster + 2 * l2idx) != 2)
605			fatal("could not read ref cluster");
606		refs = be16toh(refs) + 1;
607	}
608	refs = htobe16(refs);
609	if (pwrite(disk->fd, &refs, sizeof(refs), l2cluster + 2 * l2idx) != 2)
610		fatal("%s: could not write ref block", __func__);
611}
612
613/*
614 * virtio_qcow2_create
615 *
616 * Create an empty qcow2 imagefile with the specified path and size.
617 *
618 * Parameters:
619 *  imgfile_path: path to the image file to create
620 *  imgsize     : size of the image file to create (in MB)
621 *
622 * Return:
623 *  EEXIST: The requested image file already exists
624 *  0     : Image file successfully created
625 *  Exxxx : Various other Exxxx errno codes due to other I/O errors
626 */
627int
628virtio_qcow2_create(const char *imgfile_path,
629    const char *base_path, long imgsize)
630{
631	struct qcheader {
632		char magic[4];
633		uint32_t version;
634		uint64_t backingoff;
635		uint32_t backingsz;
636		uint32_t clustershift;
637		uint64_t disksz;
638		uint32_t cryptmethod;
639		uint32_t l1sz;
640		uint64_t l1off;
641		uint64_t refoff;
642		uint32_t refsz;
643		uint32_t snapcount;
644		uint64_t snapsz;
645		/* v3 additions */
646		uint64_t incompatfeatures;
647		uint64_t compatfeatures;
648		uint64_t autoclearfeatures;
649		uint32_t reforder;
650		uint32_t headersz;
651	} __packed hdr, basehdr;
652	int fd, ret;
653	ssize_t base_len;
654	uint64_t l1sz, refsz, disksz, initsz, clustersz;
655	uint64_t l1off, refoff, v, i, l1entrysz, refentrysz;
656	uint16_t refs;
657
658	disksz = 1024 * 1024 * imgsize;
659
660	if (base_path) {
661		fd = open(base_path, O_RDONLY);
662		if (read(fd, &basehdr, sizeof(basehdr)) != sizeof(basehdr))
663			err(1, "failure to read base image header");
664		close(fd);
665		if (strncmp(basehdr.magic,
666		    VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0)
667			errx(1, "base image is not a qcow2 file");
668		if (!disksz)
669			disksz = betoh64(basehdr.disksz);
670		else if (disksz != betoh64(basehdr.disksz))
671			errx(1, "base size does not match requested size");
672	}
673	if (!base_path && !disksz)
674		errx(1, "missing disk size");
675
676	clustersz = (1<<16);
677	l1off = ALIGNSZ(sizeof(hdr), clustersz);
678
679	l1entrysz = clustersz * clustersz / 8;
680	l1sz = (disksz + l1entrysz - 1) / l1entrysz;
681
682	refoff = ALIGNSZ(l1off + 8*l1sz, clustersz);
683	refentrysz = clustersz * clustersz * clustersz / 2;
684	refsz = (disksz + refentrysz - 1) / refentrysz;
685
686	initsz = ALIGNSZ(refoff + refsz*clustersz, clustersz);
687	base_len = base_path ? strlen(base_path) : 0;
688
689	memcpy(hdr.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW));
690	hdr.version		= htobe32(3);
691	hdr.backingoff		= htobe64(base_path ? sizeof(hdr) : 0);
692	hdr.backingsz		= htobe32(base_len);
693	hdr.clustershift	= htobe32(16);
694	hdr.disksz		= htobe64(disksz);
695	hdr.cryptmethod		= htobe32(0);
696	hdr.l1sz		= htobe32(l1sz);
697	hdr.l1off		= htobe64(l1off);
698	hdr.refoff		= htobe64(refoff);
699	hdr.refsz		= htobe32(refsz);
700	hdr.snapcount		= htobe32(0);
701	hdr.snapsz		= htobe64(0);
702	hdr.incompatfeatures	= htobe64(0);
703	hdr.compatfeatures	= htobe64(0);
704	hdr.autoclearfeatures	= htobe64(0);
705	hdr.reforder		= htobe32(4);
706	hdr.headersz		= htobe32(sizeof(hdr));
707
708	/* Refuse to overwrite an existing image */
709	fd = open(imgfile_path, O_RDWR | O_CREAT | O_TRUNC | O_EXCL,
710	    S_IRUSR | S_IWUSR);
711	if (fd == -1)
712		return (errno);
713
714	/* Write out the header */
715	if (write(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
716		goto error;
717
718	/* Add the base image */
719	if (base_path && write(fd, base_path, base_len) != base_len)
720		goto error;
721
722	/* Extend to desired size, and add one refcount cluster */
723	if (ftruncate(fd, (off_t)initsz + clustersz) == -1)
724		goto error;
725
726	/*
727	 * Paranoia: if our disk image takes more than one cluster
728	 * to refcount the initial image, fail.
729	 */
730	if (initsz/clustersz > clustersz/2) {
731		errno = ERANGE;
732		goto error;
733	}
734
735	/* Add a refcount block, and refcount ourselves. */
736	v = htobe64(initsz);
737	if (pwrite(fd, &v, 8, refoff) != 8)
738		goto error;
739	for (i = 0; i < initsz/clustersz + 1; i++) {
740		refs = htobe16(1);
741		if (pwrite(fd, &refs, 2, initsz + 2*i) != 2)
742			goto error;
743	}
744
745	ret = close(fd);
746	return (ret);
747error:
748	ret = errno;
749	close(fd);
750	unlink(imgfile_path);
751	return (errno);
752}
753