/*	$OpenBSD: vioqcow2.c,v 1.16 2021/06/16 16:55:02 dv Exp $	*/

/*
 * Copyright (c) 2018 Ori Bernstein <ori@eigenstate.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/types.h>
#include <sys/stat.h>

#include <dev/pci/pcireg.h>
#include <machine/vmmvar.h>

#include <assert.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <libgen.h>
#include <pthread.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "virtio.h"

/*
 * Flag bits stored in the top of the 64-bit table entries.  xlate() masks
 * QCOW2_INPLACE off L1 and L2 entries to obtain the file offset, and refuses
 * L2 entries that carry QCOW2_COMPRESSED.  An entry without QCOW2_INPLACE
 * is not written in place: mkcluster() allocates a fresh cluster for it.
 */
#define QCOW2_COMPRESSED	0x4000000000000000ull
#define QCOW2_INPLACE		0x8000000000000000ull

/*
 * Bits of the header's incompatfeatures word that qc2_open() tolerates;
 * any other bit set there makes the image unsupported.
 */
#define QCOW2_DIRTY		(1 << 0)
#define QCOW2_CORRUPT		(1 << 1)

/* Incompatible-feature bits; same values as QCOW2_DIRTY/QCOW2_CORRUPT. */
enum {
	ICFEATURE_DIRTY		= 1 << 0,
	ICFEATURE_CORRUPT	= 1 << 1,
};

/* Autoclear-feature bits. */
enum {
	ACFEATURE_BITEXT	= 1 << 0,
};

/*
 * On-disk qcow2 header, read from and written to offset 0 of the image.
 * Multi-byte fields are stored big-endian; convert them with be32toh() /
 * be64toh() when reading and htobe32() / htobe64() when writing.  The
 * struct is packed so its layout matches the file byte for byte.
 */
struct qcheader {
	char magic[4];			/* compared against VM_MAGIC_QCOW */
	uint32_t version;		/* only 2 and 3 are accepted */
	uint64_t backingoff;		/* file offset of the base image path */
	uint32_t backingsz;		/* length of that path, 0 if no base */
	uint32_t clustershift;		/* cluster size is 1 << clustershift */
	uint64_t disksz;		/* virtual disk size in bytes */
	uint32_t cryptmethod;
	uint32_t l1sz;			/* number of 8-byte L1 entries */
	uint64_t l1off;			/* file offset of the L1 table */
	uint64_t refoff;		/* file offset of the refcount table */
	uint32_t refsz;			/* refcount table size in clusters */
	uint32_t snapcount;
	uint64_t snapsz;		/* loaded into qcdisk.snapoff */
	/* v3 additions */
	uint64_t incompatfeatures;
	uint64_t compatfeatures;
	uint64_t autoclearfeatures;
	uint32_t reforder;	/* Bits = 1 << reforder */
	uint32_t headersz;
} __packed;

73struct qcdisk {
74	pthread_rwlock_t lock;
75	struct qcdisk *base;
76	struct qcheader header;
77
78	int       fd;
79	uint64_t *l1;
80	off_t     end;
81	off_t	  clustersz;
82	off_t	  disksz; /* In bytes */
83	uint32_t  cryptmethod;
84
85	uint32_t l1sz;
86	off_t	 l1off;
87
88	off_t	 refoff;
89	off_t	 refsz;
90
91	uint32_t nsnap;
92	off_t	 snapoff;
93
94	/* v3 features */
95	uint64_t incompatfeatures;
96	uint64_t autoclearfeatures;
97	uint32_t refssz;
98	uint32_t headersz;
99};
100
extern char *__progname;

/* Forward declarations of the file-local helpers defined below. */
static off_t xlate(struct qcdisk *, off_t, int *);
static void copy_cluster(struct qcdisk *, struct qcdisk *, off_t, off_t);
static void inc_refs(struct qcdisk *, off_t, int);
static off_t mkcluster(struct qcdisk *, struct qcdisk *, off_t, off_t);
static int qc2_open(struct qcdisk *, int *, size_t);
static ssize_t qc2_pread(void *, char *, size_t, off_t);
static ssize_t qc2_pwrite(void *, char *, size_t, off_t);
static void qc2_close(void *, int);

112/*
113 * Initializes a raw disk image backing file from an fd.
114 * Stores the number of 512 byte sectors in *szp,
115 * returning -1 for error, 0 for success.
116 *
117 * May open snapshot base images.
118 */
119int
120virtio_qcow2_init(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd)
121{
122	struct qcdisk *diskp;
123
124	diskp = malloc(sizeof(struct qcdisk));
125	if (diskp == NULL)
126		return -1;
127	if (qc2_open(diskp, fd, nfd) == -1) {
128		log_warnx("could not open qcow2 disk");
129		return -1;
130	}
131	file->p = diskp;
132	file->pread = qc2_pread;
133	file->pwrite = qc2_pwrite;
134	file->close = qc2_close;
135	*szp = diskp->disksz;
136	return 0;
137}
138
139/*
140 * Return the path to the base image given a disk image.
141 * Called from vmctl.
142 */
143ssize_t
144virtio_qcow2_get_base(int fd, char *path, size_t npath, const char *dpath)
145{
146	char dpathbuf[PATH_MAX];
147	char expanded[PATH_MAX];
148	struct qcheader header;
149	uint64_t backingoff;
150	uint32_t backingsz;
151	char *s = NULL;
152
153	if (pread(fd, &header, sizeof(header), 0) != sizeof(header)) {
154		log_warnx("short read on header");
155		return -1;
156	}
157	if (strncmp(header.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0) {
158		log_warnx("invalid magic numbers");
159		return -1;
160	}
161	backingoff = be64toh(header.backingoff);
162	backingsz = be32toh(header.backingsz);
163	if (backingsz == 0)
164		return 0;
165
166	if (backingsz >= npath - 1) {
167		log_warnx("snapshot path too long");
168		return -1;
169	}
170	if (pread(fd, path, backingsz, backingoff) != backingsz) {
171		log_warnx("could not read snapshot base name");
172		return -1;
173	}
174	path[backingsz] = '\0';
175
176	/*
177	 * Relative paths should be interpreted relative to the disk image,
178	 * rather than relative to the directory vmd happens to be running in,
179	 * since this is the only userful interpretation.
180	 */
181	if (path[0] == '/') {
182		if (realpath(path, expanded) == NULL ||
183		    strlcpy(path, expanded, npath) >= npath) {
184			log_warnx("unable to resolve %s", path);
185			return -1;
186		}
187	} else {
188		if (strlcpy(dpathbuf, dpath, sizeof(dpathbuf)) >=
189		    sizeof(dpathbuf)) {
190			log_warnx("path too long: %s", dpath);
191			return -1;
192		}
193		s = dirname(dpathbuf);
194		if (snprintf(expanded, sizeof(expanded),
195		    "%s/%s", s, path) >= (int)sizeof(expanded)) {
196			log_warnx("path too long: %s/%s", s, path);
197			return -1;
198		}
199		if (npath < PATH_MAX ||
200		    realpath(expanded, path) == NULL) {
201			log_warnx("unable to resolve %s", path);
202			return -1;
203		}
204	}
205
206	return strlen(path);
207}
208
209static int
210qc2_open(struct qcdisk *disk, int *fds, size_t nfd)
211{
212	char basepath[PATH_MAX];
213	struct stat st;
214	struct qcheader header;
215	uint64_t backingoff;
216	uint32_t backingsz;
217	off_t i;
218	int version, fd;
219
220	pthread_rwlock_init(&disk->lock, NULL);
221	fd = fds[0];
222	disk->fd = fd;
223	disk->base = NULL;
224	disk->l1 = NULL;
225
226	if (pread(fd, &header, sizeof(header), 0) != sizeof(header))
227		fatalx("short read on header");
228	if (strncmp(header.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0)
229		fatalx("invalid magic numbers");
230
231	disk->clustersz		= (1ull << be32toh(header.clustershift));
232	disk->disksz		= be64toh(header.disksz);
233	disk->cryptmethod	= be32toh(header.cryptmethod);
234	disk->l1sz		= be32toh(header.l1sz);
235	disk->l1off		= be64toh(header.l1off);
236	disk->refsz		= be32toh(header.refsz);
237	disk->refoff		= be64toh(header.refoff);
238	disk->nsnap		= be32toh(header.snapcount);
239	disk->snapoff		= be64toh(header.snapsz);
240
241	/*
242	 * The additional features here are defined as 0 in the v2 format,
243	 * so as long as we clear the buffer before parsing, we don't need
244	 * to check versions here.
245	 */
246	disk->incompatfeatures = be64toh(header.incompatfeatures);
247	disk->autoclearfeatures = be64toh(header.autoclearfeatures);
248	disk->refssz = be32toh(header.refsz);
249	disk->headersz = be32toh(header.headersz);
250
251	/*
252	 * We only know about the dirty or corrupt bits here.
253	 */
254	if (disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT))
255		fatalx("unsupported features %llx",
256		    disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT));
257	if (be32toh(header.reforder) != 4)
258		fatalx("unsupported refcount size\n");
259
260	disk->l1 = calloc(disk->l1sz, sizeof(*disk->l1));
261	if (!disk->l1)
262		fatal("%s: could not allocate l1 table", __func__);
263	if (pread(disk->fd, disk->l1, 8 * disk->l1sz, disk->l1off)
264	    != 8 * disk->l1sz)
265		fatalx("%s: unable to read qcow2 L1 table", __func__);
266	for (i = 0; i < disk->l1sz; i++)
267		disk->l1[i] = be64toh(disk->l1[i]);
268	version = be32toh(header.version);
269	if (version != 2 && version != 3)
270		fatalx("%s: unknown qcow2 version %d", __func__, version);
271
272	backingoff = be64toh(header.backingoff);
273	backingsz = be32toh(header.backingsz);
274	if (backingsz != 0) {
275		if (backingsz >= sizeof(basepath) - 1) {
276			fatalx("%s: snapshot path too long", __func__);
277		}
278		if (pread(fd, basepath, backingsz, backingoff) != backingsz) {
279			fatalx("%s: could not read snapshot base name",
280			    __func__);
281		}
282		basepath[backingsz] = 0;
283		if (nfd <= 1) {
284			fatalx("%s: missing base image %s", __func__,
285			    basepath);
286		}
287
288
289		disk->base = calloc(1, sizeof(struct qcdisk));
290		if (!disk->base)
291			fatal("%s: could not open %s", __func__, basepath);
292		if (qc2_open(disk->base, fds + 1, nfd - 1) == -1)
293			fatalx("%s: could not open %s", __func__, basepath);
294		if (disk->base->clustersz != disk->clustersz)
295			fatalx("%s: all disk parts must share clustersize",
296			    __func__);
297	}
298	if (fstat(fd, &st) == -1)
299		fatal("%s: unable to stat disk", __func__);
300
301	disk->end = st.st_size;
302
303	log_debug("%s: qcow2 disk version %d size %lld end %lld snap %d",
304	    __func__, version, disk->disksz, disk->end, disk->nsnap);
305
306	return 0;
307}
308
309static ssize_t
310qc2_pread(void *p, char *buf, size_t len, off_t off)
311{
312	struct qcdisk *disk, *d;
313	off_t phys_off, end, cluster_off;
314	ssize_t sz, rem;
315
316	disk = p;
317	end = off + len;
318	if (off < 0 || end > disk->disksz)
319		return -1;
320
321	/* handle head chunk separately */
322	rem = len;
323	while (off != end) {
324		for (d = disk; d; d = d->base)
325			if ((phys_off = xlate(d, off, NULL)) > 0)
326				break;
327		/* Break out into chunks. This handles
328		 * three cases:
329		 *
330		 *    |----+====|========|====+-----|
331		 *
332		 * Either we are at the start of the read,
333		 * and the cluster has some leading bytes.
334		 * This means that we are reading the tail
335		 * of the cluster, and our size is:
336		 *
337		 * 	clustersz - (off % clustersz).
338		 *
339		 * Otherwise, we're reading the middle section.
340		 * We're already aligned here, so we can just
341		 * read the whole cluster size. Or we're at the
342		 * tail, at which point we just want to read the
343		 * remaining bytes.
344		 */
345		cluster_off = off % disk->clustersz;
346		sz = disk->clustersz - cluster_off;
347		if (sz > rem)
348			sz = rem;
349		/*
350		 * If we're within the disk, but don't have backing bytes,
351		 * just read back zeros.
352		 */
353		if (!d)
354			bzero(buf, sz);
355		else if (pread(d->fd, buf, sz, phys_off) != sz)
356			return -1;
357		off += sz;
358		buf += sz;
359		rem -= sz;
360	}
361	return len;
362}
363
364ssize_t
365qc2_pwrite(void *p, char *buf, size_t len, off_t off)
366{
367	struct qcdisk *disk, *d;
368	off_t phys_off, cluster_off, end;
369	ssize_t sz, rem;
370	int inplace;
371
372	d = p;
373	disk = p;
374	inplace = 1;
375	end = off + len;
376	if (off < 0 || end > disk->disksz)
377		return -1;
378	rem = len;
379	while (off != end) {
380		/* See the read code for a summary of the computation */
381		cluster_off = off % disk->clustersz;
382		sz = disk->clustersz - cluster_off;
383		if (sz > rem)
384			sz = rem;
385
386		phys_off = xlate(disk, off, &inplace);
387		if (phys_off == -1)
388			return -1;
389		/*
390		 * If we couldn't find the cluster in the writable disk,
391		 * see if it exists in the base image. If it does, we
392		 * need to copy it before the write. The copy happens
393		 * in the '!inplace' if clause below te search.
394		 */
395		if (phys_off == 0)
396			for (d = disk->base; d; d = d->base)
397				if ((phys_off = xlate(d, off, NULL)) > 0)
398					break;
399		if (!inplace || phys_off == 0)
400			phys_off = mkcluster(disk, d, off, phys_off);
401		if (phys_off == -1)
402			return -1;
403		if (phys_off < disk->clustersz)
404			fatalx("%s: writing reserved cluster", __func__);
405		if (pwrite(disk->fd, buf, sz, phys_off) != sz)
406			return -1;
407		off += sz;
408		buf += sz;
409		rem -= sz;
410	}
411	return len;
412}
413
414static void
415qc2_close(void *p, int stayopen)
416{
417	struct qcdisk *disk;
418
419	disk = p;
420	if (disk->base)
421		qc2_close(disk->base, stayopen);
422	if (!stayopen)
423		close(disk->fd);
424	free(disk->l1);
425	free(disk);
426}
427
428/*
429 * Translates a virtual offset into an on-disk offset.
430 * Returns:
431 * 	-1 on error
432 * 	 0 on 'not found'
433 * 	>0 on found
434 */
435static off_t
436xlate(struct qcdisk *disk, off_t off, int *inplace)
437{
438	off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff;
439	uint64_t buf;
440
441
442	/*
443	 * Clear out inplace flag -- xlate misses should not
444	 * be flagged as updatable in place. We will still
445	 * return 0 from them, but this leaves less surprises
446	 * in the API.
447	 */
448	if (inplace)
449		*inplace = 0;
450	pthread_rwlock_rdlock(&disk->lock);
451	if (off < 0)
452		goto err;
453
454	l2sz = disk->clustersz / 8;
455	l1off = (off / disk->clustersz) / l2sz;
456	if (l1off >= disk->l1sz)
457		goto err;
458
459	l2tab = disk->l1[l1off];
460	l2tab &= ~QCOW2_INPLACE;
461	if (l2tab == 0) {
462		pthread_rwlock_unlock(&disk->lock);
463		return 0;
464	}
465	l2off = (off / disk->clustersz) % l2sz;
466	pread(disk->fd, &buf, sizeof(buf), l2tab + l2off * 8);
467	cluster = be64toh(buf);
468	/*
469	 * cluster may be 0, but all future operations don't affect
470	 * the return value.
471	 */
472	if (inplace)
473		*inplace = !!(cluster & QCOW2_INPLACE);
474	if (cluster & QCOW2_COMPRESSED)
475		fatalx("%s: compressed clusters unsupported", __func__);
476	pthread_rwlock_unlock(&disk->lock);
477	clusteroff = 0;
478	cluster &= ~QCOW2_INPLACE;
479	if (cluster)
480		clusteroff = off % disk->clustersz;
481	return cluster + clusteroff;
482err:
483	pthread_rwlock_unlock(&disk->lock);
484	return -1;
485}
486
487/*
488 * Allocates a new cluster on disk, creating a new L2 table
489 * if needed. The cluster starts off with a refs of one,
490 * and the writable bit set.
491 *
492 * Returns -1 on error, and the physical address within the
493 * cluster of the write offset if it exists.
494 */
495static off_t
496mkcluster(struct qcdisk *disk, struct qcdisk *base, off_t off, off_t src_phys)
497{
498	off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff, orig;
499	uint64_t buf;
500	int fd;
501
502	pthread_rwlock_wrlock(&disk->lock);
503
504	cluster = -1;
505	fd = disk->fd;
506	/* L1 entries always exist */
507	l2sz = disk->clustersz / 8;
508	l1off = off / (disk->clustersz * l2sz);
509	if (l1off >= disk->l1sz)
510		fatalx("l1 offset outside disk");
511
512	disk->end = (disk->end + disk->clustersz - 1) & ~(disk->clustersz - 1);
513
514	l2tab = disk->l1[l1off];
515	l2off = (off / disk->clustersz) % l2sz;
516	/* We may need to create or clone an L2 entry to map the block */
517	if (l2tab == 0 || (l2tab & QCOW2_INPLACE) == 0) {
518		orig = l2tab & ~QCOW2_INPLACE;
519		l2tab = disk->end;
520		disk->end += disk->clustersz;
521		if (ftruncate(disk->fd, disk->end) == -1)
522			fatal("%s: ftruncate failed", __func__);
523
524		/*
525		 * If we translated, found a L2 entry, but it needed to
526		 * be copied, copy it.
527		 */
528		if (orig != 0)
529			copy_cluster(disk, disk, l2tab, orig);
530		/* Update l1 -- we flush it later */
531		disk->l1[l1off] = l2tab | QCOW2_INPLACE;
532		inc_refs(disk, l2tab, 1);
533	}
534	l2tab &= ~QCOW2_INPLACE;
535
536	/* Grow the disk */
537	if (ftruncate(disk->fd, disk->end + disk->clustersz) < 0)
538		fatalx("%s: could not grow disk", __func__);
539	if (src_phys > 0)
540		copy_cluster(disk, base, disk->end, src_phys);
541	cluster = disk->end;
542	disk->end += disk->clustersz;
543	buf = htobe64(cluster | QCOW2_INPLACE);
544	if (pwrite(disk->fd, &buf, sizeof(buf), l2tab + l2off * 8) != 8)
545		fatalx("%s: could not write cluster", __func__);
546
547	/* TODO: lazily sync: currently VMD doesn't close things */
548	buf = htobe64(disk->l1[l1off]);
549	if (pwrite(disk->fd, &buf, sizeof(buf), disk->l1off + 8 * l1off) != 8)
550		fatalx("%s: could not write l1", __func__);
551	inc_refs(disk, cluster, 1);
552
553	pthread_rwlock_unlock(&disk->lock);
554	clusteroff = off % disk->clustersz;
555	if (cluster + clusteroff < disk->clustersz)
556		fatalx("write would clobber header");
557	return cluster + clusteroff;
558}
559
560/* Copies a cluster containing src to dst. Src and dst need not be aligned. */
561static void
562copy_cluster(struct qcdisk *disk, struct qcdisk *base, off_t dst, off_t src)
563{
564	char *scratch;
565
566	scratch = malloc(disk->clustersz);
567	if (!scratch)
568		fatal("out of memory");
569	src &= ~(disk->clustersz - 1);
570	dst &= ~(disk->clustersz - 1);
571	if (pread(base->fd, scratch, disk->clustersz, src) == -1)
572		fatal("%s: could not read cluster", __func__);
573	if (pwrite(disk->fd, scratch, disk->clustersz, dst) == -1)
574		fatal("%s: could not write cluster", __func__);
575	free(scratch);
576}
577
578static void
579inc_refs(struct qcdisk *disk, off_t off, int newcluster)
580{
581	off_t l1off, l1idx, l2idx, l2cluster;
582	size_t nper;
583	uint16_t refs;
584	uint64_t buf;
585
586	off &= ~QCOW2_INPLACE;
587	nper = disk->clustersz / 2;
588	l1idx = (off / disk->clustersz) / nper;
589	l2idx = (off / disk->clustersz) % nper;
590	l1off = disk->refoff + 8 * l1idx;
591	if (pread(disk->fd, &buf, sizeof(buf), l1off) != 8)
592		fatal("could not read refs");
593
594	l2cluster = be64toh(buf);
595	if (l2cluster == 0) {
596		l2cluster = disk->end;
597		disk->end += disk->clustersz;
598		if (ftruncate(disk->fd, disk->end) < 0)
599			fatal("%s: failed to allocate ref block", __func__);
600		buf = htobe64(l2cluster);
601		if (pwrite(disk->fd, &buf, sizeof(buf), l1off) != 8)
602			fatal("%s: failed to write ref block", __func__);
603	}
604
605	refs = 1;
606	if (!newcluster) {
607		if (pread(disk->fd, &refs, sizeof(refs),
608		    l2cluster + 2 * l2idx) != 2)
609			fatal("could not read ref cluster");
610		refs = be16toh(refs) + 1;
611	}
612	refs = htobe16(refs);
613	if (pwrite(disk->fd, &refs, sizeof(refs), l2cluster + 2 * l2idx) != 2)
614		fatal("%s: could not write ref block", __func__);
615}
616
617/*
618 * virtio_qcow2_create
619 *
620 * Create an empty qcow2 imagefile with the specified path and size.
621 *
622 * Parameters:
623 *  imgfile_path: path to the image file to create
624 *  imgsize     : size of the image file to create (in MB)
625 *
626 * Return:
627 *  EEXIST: The requested image file already exists
628 *  0     : Image file successfully created
629 *  Exxxx : Various other Exxxx errno codes due to other I/O errors
630 */
631int
632virtio_qcow2_create(const char *imgfile_path,
633    const char *base_path, long imgsize)
634{
635	struct qcheader hdr, basehdr;
636	int fd, ret;
637	ssize_t base_len;
638	uint64_t l1sz, refsz, disksz, initsz, clustersz;
639	uint64_t l1off, refoff, v, i, l1entrysz, refentrysz;
640	uint16_t refs;
641
642	disksz = 1024 * 1024 * imgsize;
643
644	if (base_path) {
645		fd = open(base_path, O_RDONLY);
646		if (read(fd, &basehdr, sizeof(basehdr)) != sizeof(basehdr))
647			err(1, "failure to read base image header");
648		close(fd);
649		if (strncmp(basehdr.magic,
650		    VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0)
651			errx(1, "base image is not a qcow2 file");
652		if (!disksz)
653			disksz = betoh64(basehdr.disksz);
654		else if (disksz != betoh64(basehdr.disksz))
655			errx(1, "base size does not match requested size");
656	}
657	if (!base_path && !disksz)
658		errx(1, "missing disk size");
659
660	clustersz = (1<<16);
661	l1off = ALIGNSZ(sizeof(hdr), clustersz);
662
663	l1entrysz = clustersz * clustersz / 8;
664	l1sz = (disksz + l1entrysz - 1) / l1entrysz;
665
666	refoff = ALIGNSZ(l1off + 8*l1sz, clustersz);
667	refentrysz = clustersz * clustersz * clustersz / 2;
668	refsz = (disksz + refentrysz - 1) / refentrysz;
669
670	initsz = ALIGNSZ(refoff + refsz*clustersz, clustersz);
671	base_len = base_path ? strlen(base_path) : 0;
672
673	memcpy(hdr.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW));
674	hdr.version		= htobe32(3);
675	hdr.backingoff		= htobe64(base_path ? sizeof(hdr) : 0);
676	hdr.backingsz		= htobe32(base_len);
677	hdr.clustershift	= htobe32(16);
678	hdr.disksz		= htobe64(disksz);
679	hdr.cryptmethod		= htobe32(0);
680	hdr.l1sz		= htobe32(l1sz);
681	hdr.l1off		= htobe64(l1off);
682	hdr.refoff		= htobe64(refoff);
683	hdr.refsz		= htobe32(refsz);
684	hdr.snapcount		= htobe32(0);
685	hdr.snapsz		= htobe64(0);
686	hdr.incompatfeatures	= htobe64(0);
687	hdr.compatfeatures	= htobe64(0);
688	hdr.autoclearfeatures	= htobe64(0);
689	hdr.reforder		= htobe32(4);
690	hdr.headersz		= htobe32(sizeof(hdr));
691
692	/* Refuse to overwrite an existing image */
693	fd = open(imgfile_path, O_RDWR | O_CREAT | O_TRUNC | O_EXCL,
694	    S_IRUSR | S_IWUSR);
695	if (fd == -1)
696		return (errno);
697
698	/* Write out the header */
699	if (write(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
700		goto error;
701
702	/* Add the base image */
703	if (base_path && write(fd, base_path, base_len) != base_len)
704		goto error;
705
706	/* Extend to desired size, and add one refcount cluster */
707	if (ftruncate(fd, (off_t)initsz + clustersz) == -1)
708		goto error;
709
710	/*
711	 * Paranoia: if our disk image takes more than one cluster
712	 * to refcount the initial image, fail.
713	 */
714	if (initsz/clustersz > clustersz/2) {
715		errno = ERANGE;
716		goto error;
717	}
718
719	/* Add a refcount block, and refcount ourselves. */
720	v = htobe64(initsz);
721	if (pwrite(fd, &v, 8, refoff) != 8)
722		goto error;
723	for (i = 0; i < initsz/clustersz + 1; i++) {
724		refs = htobe16(1);
725		if (pwrite(fd, &refs, 2, initsz + 2*i) != 2)
726			goto error;
727	}
728
729	ret = close(fd);
730	return (ret);
731error:
732	ret = errno;
733	close(fd);
734	unlink(imgfile_path);
735	return (errno);
736}
737