/*	$OpenBSD: vioqcow2.c,v 1.24 2023/09/14 15:25:43 dv Exp $	*/

/*
 * Copyright (c) 2018 Ori Bernstein <ori@eigenstate.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/types.h>
#include <sys/stat.h>

#include <dev/pci/pcireg.h>

#include <assert.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <libgen.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "virtio.h"

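/*
 * Flag bits carried in the top of each 64-bit L1/L2 table entry: bit 62
 * marks a compressed cluster and bit 63 marks a cluster whose refcount is
 * one, so it may be rewritten in place (the qcow2 "COPIED" flag).
 */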
#define QCOW2_COMPRESSED	0x4000000000000000ull
#define QCOW2_INPLACE		0x8000000000000000ull

#define QCOW2_DIRTY		(1 << 0)
#define QCOW2_CORRUPT		(1 << 1)

enum {
	ICFEATURE_DIRTY		= 1 << 0,
	ICFEATURE_CORRUPT	= 1 << 1,
};

enum {
	ACFEATURE_BITEXT	= 1 << 0,
};

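/*
 * On-disk qcow2 header. All multi-byte fields are stored big-endian and
 * are byte-swapped on load in qc2_open() and on creation in
 * virtio_qcow2_create().
 */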
struct qcheader {
	char magic[4];
	uint32_t version;
	uint64_t backingoff;
	uint32_t backingsz;
	uint32_t clustershift;
	uint64_t disksz;
	uint32_t cryptmethod;
	uint32_t l1sz;
	uint64_t l1off;
	uint64_t refoff;
	uint32_t refsz;
	uint32_t snapcount;
	uint64_t snapsz;
	/* v3 additions */
	uint64_t incompatfeatures;
	uint64_t compatfeatures;
	uint64_t autoclearfeatures;
	uint32_t reforder;	/* Bits = 1 << reforder */
	uint32_t headersz;
} __packed;

struct qcdisk {
	pthread_rwlock_t lock;
	struct qcdisk *base;
	struct qcheader header;

	int       fd;
	uint64_t *l1;
	off_t     end;
	off_t	  clustersz;
	off_t	  disksz; /* In bytes */
	uint32_t  cryptmethod;

	uint32_t l1sz;
	off_t	 l1off;

	off_t	 refoff;
	off_t	 refsz;

	uint32_t nsnap;
	off_t	 snapoff;

	/* v3 features */
	uint64_t incompatfeatures;
	uint64_t autoclearfeatures;
	uint32_t refssz;
	uint32_t headersz;
};

extern char *__progname;

static off_t xlate(struct qcdisk *, off_t, int *);
static void copy_cluster(struct qcdisk *, struct qcdisk *, off_t, off_t);
static void inc_refs(struct qcdisk *, off_t, int);
static off_t mkcluster(struct qcdisk *, struct qcdisk *, off_t, off_t);
static int qc2_open(struct qcdisk *, int *, size_t);
static ssize_t qc2_pread(void *, char *, size_t, off_t);
static ssize_t qc2_preadv(void *, struct iovec *, int, off_t);
static ssize_t qc2_pwrite(void *, char *, size_t, off_t);
static ssize_t qc2_pwritev(void *, struct iovec *, int, off_t);
static void qc2_close(void *, int);

/*
 * Initializes a qcow2 disk image backing file from an fd. Stores the
 * number of bytes in *szp, returning -1 for error, 0 for success.
 *
 * May open snapshot base images.
 */
int
virtio_qcow2_init(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd)
{
	struct qcdisk *diskp;

	diskp = malloc(sizeof(struct qcdisk));
	if (diskp == NULL)
		return -1;
	if (qc2_open(diskp, fd, nfd) == -1) {
		log_warnx("could not open qcow2 disk");
		free(diskp);
		return -1;
	}
	file->p = diskp;
	file->pread = qc2_pread;
	file->preadv = qc2_preadv;
	file->pwrite = qc2_pwrite;
	file->pwritev = qc2_pwritev;
	file->close = qc2_close;
	*szp = diskp->disksz;
	return 0;
}

/*
 * Return the path to the base image given a disk image.
 * Called from vmctl.
 */
ssize_t
virtio_qcow2_get_base(int fd, char *path, size_t npath, const char *dpath)
{
	char dpathbuf[PATH_MAX];
	char expanded[PATH_MAX];
	struct qcheader header;
	uint64_t backingoff;
	uint32_t backingsz;
	char *s = NULL;

	if (pread(fd, &header, sizeof(header), 0) != sizeof(header)) {
		log_warnx("short read on header");
		return -1;
	}
	if (strncmp(header.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0) {
		log_warnx("invalid magic numbers");
		return -1;
	}
	backingoff = be64toh(header.backingoff);
	backingsz = be32toh(header.backingsz);
	if (backingsz == 0)
		return 0;

	if (backingsz >= npath - 1) {
		log_warnx("snapshot path too long");
		return -1;
	}
	if (pread(fd, path, backingsz, backingoff) != backingsz) {
		log_warnx("could not read snapshot base name");
		return -1;
	}
	path[backingsz] = '\0';

	/*
	 * Relative paths should be interpreted relative to the disk image,
	 * rather than relative to the directory vmd happens to be running in,
	 * since this is the only useful interpretation.
	 */
	if (path[0] == '/') {
		if (realpath(path, expanded) == NULL ||
		    strlcpy(path, expanded, npath) >= npath) {
			log_warnx("unable to resolve %s", path);
			return -1;
		}
	} else {
		if (strlcpy(dpathbuf, dpath, sizeof(dpathbuf)) >=
		    sizeof(dpathbuf)) {
			log_warnx("path too long: %s", dpath);
			return -1;
		}
		s = dirname(dpathbuf);
		if (snprintf(expanded, sizeof(expanded),
		    "%s/%s", s, path) >= (int)sizeof(expanded)) {
			log_warnx("path too long: %s/%s", s, path);
			return -1;
		}
		if (npath < PATH_MAX ||
		    realpath(expanded, path) == NULL) {
			log_warnx("unable to resolve %s", path);
			return -1;
		}
	}

	return strlen(path);
}

static int
qc2_open(struct qcdisk *disk, int *fds, size_t nfd)
{
	char basepath[PATH_MAX];
	struct stat st;
	struct qcheader header;
	uint64_t backingoff;
	uint32_t backingsz;
	off_t i;
	int version, fd;

	pthread_rwlock_init(&disk->lock, NULL);
	fd = fds[0];
	disk->fd = fd;
	disk->base = NULL;
	disk->l1 = NULL;

	if (pread(fd, &header, sizeof(header), 0) != sizeof(header))
		fatalx("short read on header");
	if (strncmp(header.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0)
		fatalx("invalid magic numbers");

	disk->clustersz		= (1ull << be32toh(header.clustershift));
	disk->disksz		= be64toh(header.disksz);
	disk->cryptmethod	= be32toh(header.cryptmethod);
	disk->l1sz		= be32toh(header.l1sz);
	disk->l1off		= be64toh(header.l1off);
	disk->refsz		= be32toh(header.refsz);
	disk->refoff		= be64toh(header.refoff);
	disk->nsnap		= be32toh(header.snapcount);
	disk->snapoff		= be64toh(header.snapsz);

	/*
	 * The additional features here are defined as 0 in the v2 format,
	 * so as long as we clear the buffer before parsing, we don't need
	 * to check versions here.
	 */
	disk->incompatfeatures = be64toh(header.incompatfeatures);
	disk->autoclearfeatures = be64toh(header.autoclearfeatures);
	disk->refssz = be32toh(header.refsz);
	disk->headersz = be32toh(header.headersz);

	/*
	 * We only know about the dirty or corrupt bits here.
	 */
	if (disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT))
		fatalx("unsupported features %llx",
		    disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT));
	if (be32toh(header.reforder) != 4)
		fatalx("unsupported refcount size\n");

	disk->l1 = calloc(disk->l1sz, sizeof(*disk->l1));
	if (!disk->l1)
		fatal("%s: could not allocate l1 table", __func__);
	if (pread(disk->fd, disk->l1, 8 * disk->l1sz, disk->l1off)
	    != 8 * disk->l1sz)
		fatalx("%s: unable to read qcow2 L1 table", __func__);
	for (i = 0; i < disk->l1sz; i++)
		disk->l1[i] = be64toh(disk->l1[i]);
	version = be32toh(header.version);
	if (version != 2 && version != 3)
		fatalx("%s: unknown qcow2 version %d", __func__, version);

	backingoff = be64toh(header.backingoff);
	backingsz = be32toh(header.backingsz);
	if (backingsz != 0) {
		if (backingsz >= sizeof(basepath) - 1) {
			fatalx("%s: snapshot path too long", __func__);
		}
		if (pread(fd, basepath, backingsz, backingoff) != backingsz) {
			fatalx("%s: could not read snapshot base name",
			    __func__);
		}
		basepath[backingsz] = 0;
		if (nfd <= 1) {
			fatalx("%s: missing base image %s", __func__,
			    basepath);
		}


		disk->base = calloc(1, sizeof(struct qcdisk));
		if (!disk->base)
			fatal("%s: could not open %s", __func__, basepath);
		if (qc2_open(disk->base, fds + 1, nfd - 1) == -1)
			fatalx("%s: could not open %s", __func__, basepath);
		if (disk->base->clustersz != disk->clustersz)
			fatalx("%s: all disk parts must share clustersize",
			    __func__);
	}
	if (fstat(fd, &st) == -1)
		fatal("%s: unable to stat disk", __func__);

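	/* Record the current file size; new clusters are allocated from here. */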
	disk->end = st.st_size;

	log_debug("%s: qcow2 disk version %d size %lld end %lld snap %d",
	    __func__, version, disk->disksz, disk->end, disk->nsnap);

	return 0;
}

static ssize_t
qc2_preadv(void *p, struct iovec *iov, int cnt, off_t offset)
{
	int i;
	off_t pos = offset;
	ssize_t sz = 0, total = 0;

	for (i = 0; i < cnt; i++, iov++) {
		sz = qc2_pread(p, iov->iov_base, iov->iov_len, pos);
		if (sz == -1)
			return (sz);
		total += sz;
		pos += sz;
	}

	return (total);
}

static ssize_t
qc2_pread(void *p, char *buf, size_t len, off_t off)
{
	struct qcdisk *disk, *d;
	off_t phys_off, end, cluster_off;
	ssize_t sz, rem;

	disk = p;
	end = off + len;
	if (off < 0 || end > disk->disksz)
		return -1;

	/* handle head chunk separately */
	rem = len;
	while (off != end) {
		for (d = disk; d; d = d->base)
			if ((phys_off = xlate(d, off, NULL)) > 0)
				break;
		/* Break out into chunks. This handles
		 * three cases:
		 *
		 *    |----+====|========|====+-----|
		 *
		 * Either we are at the start of the read,
		 * and the cluster has some leading bytes.
		 * This means that we are reading the tail
		 * of the cluster, and our size is:
		 *
		 * 	clustersz - (off % clustersz).
		 *
		 * Otherwise, we're reading the middle section.
		 * We're already aligned here, so we can just
		 * read the whole cluster size. Or we're at the
		 * tail, at which point we just want to read the
		 * remaining bytes.
		 */
		cluster_off = off % disk->clustersz;
		sz = disk->clustersz - cluster_off;
		if (sz > rem)
			sz = rem;
		/*
		 * If we're within the disk, but don't have backing bytes,
		 * just read back zeros.
		 */
		if (!d)
			bzero(buf, sz);
		else if (pread(d->fd, buf, sz, phys_off) != sz)
			return -1;
		off += sz;
		buf += sz;
		rem -= sz;
	}
	return len;
}

static ssize_t
qc2_pwritev(void *p, struct iovec *iov, int cnt, off_t offset)
{
	int i;
	off_t pos = offset;
	ssize_t sz = 0, total = 0;

	for (i = 0; i < cnt; i++, iov++) {
		sz = qc2_pwrite(p, iov->iov_base, iov->iov_len, pos);
		if (sz == -1)
			return (sz);
		total += sz;
		pos += sz;
	}

	return (total);
}

static ssize_t
qc2_pwrite(void *p, char *buf, size_t len, off_t off)
{
	struct qcdisk *disk, *d;
	off_t phys_off, cluster_off, end;
	ssize_t sz, rem;
	int inplace;

	d = p;
	disk = p;
	inplace = 1;
	end = off + len;
	if (off < 0 || end > disk->disksz)
		return -1;
	rem = len;
	while (off != end) {
		/* See the read code for a summary of the computation */
		cluster_off = off % disk->clustersz;
		sz = disk->clustersz - cluster_off;
		if (sz > rem)
			sz = rem;

		phys_off = xlate(disk, off, &inplace);
		if (phys_off == -1)
			return -1;
		/*
		 * If we couldn't find the cluster in the writable disk,
		 * see if it exists in the base image. If it does, we
		 * need to copy it before the write. The copy happens
		 * in the '!inplace' if clause below the search.
		 */
		if (phys_off == 0)
			for (d = disk->base; d; d = d->base)
				if ((phys_off = xlate(d, off, NULL)) > 0)
					break;
		if (!inplace || phys_off == 0)
			phys_off = mkcluster(disk, d, off, phys_off);
		if (phys_off == -1)
			return -1;
		if (phys_off < disk->clustersz)
			fatalx("%s: writing reserved cluster", __func__);
		if (pwrite(disk->fd, buf, sz, phys_off) != sz)
			return -1;
		off += sz;
		buf += sz;
		rem -= sz;
	}
	return len;
}

static void
qc2_close(void *p, int stayopen)
{
	struct qcdisk *disk;

	disk = p;
	if (disk->base)
		qc2_close(disk->base, stayopen);
	if (!stayopen)
		close(disk->fd);
	free(disk->l1);
	free(disk);
}

/*
 * Translates a virtual offset into an on-disk offset.
 * Returns:
 * 	-1 on error
 * 	 0 on 'not found'
 * 	>0 on found
 */
static off_t
xlate(struct qcdisk *disk, off_t off, int *inplace)
{
	off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff;
	uint64_t buf;


	/*
	 * Clear out inplace flag -- xlate misses should not
	 * be flagged as updatable in place. We will still
	 * return 0 from them, but this leaves fewer surprises
	 * in the API.
	 */
	if (inplace)
		*inplace = 0;
	pthread_rwlock_rdlock(&disk->lock);
	if (off < 0)
		goto err;

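	/*
	 * Each L2 table fills one cluster with 8-byte entries, so a single
	 * L1 entry spans clustersz / 8 clusters of guest data; index the
	 * L1 table by the guest cluster number divided by that span.
	 */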
	l2sz = disk->clustersz / 8;
	l1off = (off / disk->clustersz) / l2sz;
	if (l1off >= disk->l1sz)
		goto err;

	l2tab = disk->l1[l1off];
	l2tab &= ~QCOW2_INPLACE;
	if (l2tab == 0) {
		pthread_rwlock_unlock(&disk->lock);
		return 0;
	}
	l2off = (off / disk->clustersz) % l2sz;
	pread(disk->fd, &buf, sizeof(buf), l2tab + l2off * 8);
	cluster = be64toh(buf);
	/*
	 * cluster may be 0; in that case nothing below changes it, so we
	 * still return 0 ('not found') to the caller.
	 */
	if (inplace)
		*inplace = !!(cluster & QCOW2_INPLACE);
	if (cluster & QCOW2_COMPRESSED)
		fatalx("%s: compressed clusters unsupported", __func__);
	pthread_rwlock_unlock(&disk->lock);
	clusteroff = 0;
	cluster &= ~QCOW2_INPLACE;
	if (cluster)
		clusteroff = off % disk->clustersz;
	return cluster + clusteroff;
err:
	pthread_rwlock_unlock(&disk->lock);
	return -1;
}

/*
 * Allocates a new cluster on disk, creating a new L2 table
 * if needed. The cluster starts off with a refcount of one,
 * and the writable bit set.
 *
 * Returns -1 on error, otherwise the physical address of the
 * write offset within the newly allocated cluster.
 */
static off_t
mkcluster(struct qcdisk *disk, struct qcdisk *base, off_t off, off_t src_phys)
{
	off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff, orig;
	uint64_t buf;

	pthread_rwlock_wrlock(&disk->lock);

	cluster = -1;
	/* L1 entries always exist */
	l2sz = disk->clustersz / 8;
	l1off = off / (disk->clustersz * l2sz);
	if (l1off >= disk->l1sz)
		fatalx("l1 offset outside disk");

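	/* Round the current end of the file up to a cluster boundary. */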
	disk->end = (disk->end + disk->clustersz - 1) & ~(disk->clustersz - 1);

	l2tab = disk->l1[l1off];
	l2off = (off / disk->clustersz) % l2sz;
	/* We may need to create or clone an L2 entry to map the block */
	if (l2tab == 0 || (l2tab & QCOW2_INPLACE) == 0) {
		orig = l2tab & ~QCOW2_INPLACE;
		l2tab = disk->end;
		disk->end += disk->clustersz;
		if (ftruncate(disk->fd, disk->end) == -1)
			fatal("%s: ftruncate failed", __func__);

		/*
		 * If we translated and found an L2 entry, but it needed
		 * to be copied, copy it.
		 */
		if (orig != 0)
			copy_cluster(disk, disk, l2tab, orig);
		/* Update l1 -- we flush it later */
		disk->l1[l1off] = l2tab | QCOW2_INPLACE;
		inc_refs(disk, l2tab, 1);
	}
	l2tab &= ~QCOW2_INPLACE;

	/* Grow the disk */
	if (ftruncate(disk->fd, disk->end + disk->clustersz) < 0)
		fatal("%s: could not grow disk", __func__);
	if (src_phys > 0)
		copy_cluster(disk, base, disk->end, src_phys);
	cluster = disk->end;
	disk->end += disk->clustersz;
	buf = htobe64(cluster | QCOW2_INPLACE);
	if (pwrite(disk->fd, &buf, sizeof(buf), l2tab + l2off * 8) != 8)
		fatalx("%s: could not write cluster", __func__);

	/* TODO: lazily sync: currently VMD doesn't close things */
	buf = htobe64(disk->l1[l1off]);
	if (pwrite(disk->fd, &buf, sizeof(buf), disk->l1off + 8 * l1off) != 8)
		fatalx("%s: could not write l1", __func__);
	inc_refs(disk, cluster, 1);

	pthread_rwlock_unlock(&disk->lock);
	clusteroff = off % disk->clustersz;
	if (cluster + clusteroff < disk->clustersz)
		fatalx("write would clobber header");
	return cluster + clusteroff;
}

/* Copies a cluster containing src to dst. Src and dst need not be aligned. */
static void
copy_cluster(struct qcdisk *disk, struct qcdisk *base, off_t dst, off_t src)
{
	char *scratch;

	scratch = malloc(disk->clustersz);
	if (!scratch)
		fatal("out of memory");
	src &= ~(disk->clustersz - 1);
	dst &= ~(disk->clustersz - 1);
	if (pread(base->fd, scratch, disk->clustersz, src) == -1)
		fatal("%s: could not read cluster", __func__);
	if (pwrite(disk->fd, scratch, disk->clustersz, dst) == -1)
		fatal("%s: could not write cluster", __func__);
	free(scratch);
}

static void
inc_refs(struct qcdisk *disk, off_t off, int newcluster)
{
	off_t l1off, l1idx, l2idx, l2cluster;
	size_t nper;
	uint16_t refs;
	uint64_t buf;

	off &= ~QCOW2_INPLACE;
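	/*
	 * Refcounts are 16 bits wide (qc2_open() rejects any other
	 * reforder), so one refcount block holds clustersz / 2 entries.
	 */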
	nper = disk->clustersz / 2;
	l1idx = (off / disk->clustersz) / nper;
	l2idx = (off / disk->clustersz) % nper;
	l1off = disk->refoff + 8 * l1idx;
	if (pread(disk->fd, &buf, sizeof(buf), l1off) != 8)
		fatal("could not read refs");

	l2cluster = be64toh(buf);
	if (l2cluster == 0) {
		l2cluster = disk->end;
		disk->end += disk->clustersz;
		if (ftruncate(disk->fd, disk->end) < 0)
			fatal("%s: failed to allocate ref block", __func__);
		buf = htobe64(l2cluster);
		if (pwrite(disk->fd, &buf, sizeof(buf), l1off) != 8)
			fatal("%s: failed to write ref block", __func__);
	}

	refs = 1;
	if (!newcluster) {
		if (pread(disk->fd, &refs, sizeof(refs),
		    l2cluster + 2 * l2idx) != 2)
			fatal("could not read ref cluster");
		refs = be16toh(refs) + 1;
	}
	refs = htobe16(refs);
	if (pwrite(disk->fd, &refs, sizeof(refs), l2cluster + 2 * l2idx) != 2)
		fatal("%s: could not write ref block", __func__);
}

/*
 * virtio_qcow2_create
 *
 * Create an empty qcow2 image file with the specified path and size,
 * optionally layered above a base image.
 *
 * Parameters:
 *  imgfile_path: path to the image file to create
 *  base_path   : path to the base image, or NULL for none
 *  disksz      : size of the virtual disk to create (in bytes)
 *
 * Return:
 *  EEXIST: The requested image file already exists
 *  0     : Image file successfully created
 *  Exxxx : Various other Exxxx errno codes due to other I/O errors
 */
int
virtio_qcow2_create(const char *imgfile_path,
    const char *base_path, uint64_t disksz)
{
	struct qcheader hdr, basehdr;
	int fd, ret;
	ssize_t base_len;
	uint64_t l1sz, refsz, initsz, clustersz;
	uint64_t l1off, refoff, v, i, l1entrysz, refentrysz;
	uint16_t refs;

	if (base_path) {
		fd = open(base_path, O_RDONLY);
		if (fd == -1)
			err(1, "failed to open base image %s", base_path);
		if (read(fd, &basehdr, sizeof(basehdr)) != sizeof(basehdr))
			errx(1, "failure to read base image header");
		close(fd);
		if (strncmp(basehdr.magic,
		    VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0)
			errx(1, "base image is not a qcow2 file");
		if (!disksz)
			disksz = betoh64(basehdr.disksz);
		else if (disksz != betoh64(basehdr.disksz))
			errx(1, "base size does not match requested size");
	}
	if (!base_path && !disksz)
		errx(1, "missing disk size");

	clustersz = (1<<16);
	l1off = ALIGNSZ(sizeof(hdr), clustersz);

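	/*
	 * Each L1 entry maps one L2 table: clustersz / 8 entries, each
	 * covering one cluster, i.e. clustersz * clustersz / 8 bytes of
	 * guest data per L1 entry.
	 */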
	l1entrysz = clustersz * clustersz / 8;
	l1sz = (disksz + l1entrysz - 1) / l1entrysz;

	refoff = ALIGNSZ(l1off + 8*l1sz, clustersz);
	refentrysz = clustersz * clustersz * clustersz / 2;
	refsz = (disksz + refentrysz - 1) / refentrysz;

	initsz = ALIGNSZ(refoff + refsz*clustersz, clustersz);
	base_len = base_path ? strlen(base_path) : 0;

	memcpy(hdr.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW));
	hdr.version		= htobe32(3);
	hdr.backingoff		= htobe64(base_path ? sizeof(hdr) : 0);
	hdr.backingsz		= htobe32(base_len);
	hdr.clustershift	= htobe32(16);
	hdr.disksz		= htobe64(disksz);
	hdr.cryptmethod		= htobe32(0);
	hdr.l1sz		= htobe32(l1sz);
	hdr.l1off		= htobe64(l1off);
	hdr.refoff		= htobe64(refoff);
	hdr.refsz		= htobe32(refsz);
	hdr.snapcount		= htobe32(0);
	hdr.snapsz		= htobe64(0);
	hdr.incompatfeatures	= htobe64(0);
	hdr.compatfeatures	= htobe64(0);
	hdr.autoclearfeatures	= htobe64(0);
	hdr.reforder		= htobe32(4);
	hdr.headersz		= htobe32(sizeof(hdr));

	/* Refuse to overwrite an existing image */
	fd = open(imgfile_path, O_RDWR | O_CREAT | O_TRUNC | O_EXCL,
	    S_IRUSR | S_IWUSR);
	if (fd == -1)
		return (errno);

	/* Write out the header */
	if (write(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
		goto error;

	/* Add the base image */
	if (base_path && write(fd, base_path, base_len) != base_len)
		goto error;

	/* Extend to desired size, and add one refcount cluster */
	if (ftruncate(fd, (off_t)initsz + clustersz) == -1)
		goto error;

	/*
	 * Paranoia: if our disk image takes more than one cluster
	 * to refcount the initial image, fail.
	 */
	if (initsz/clustersz > clustersz/2) {
		errno = ERANGE;
		goto error;
	}

	/* Add a refcount block, and refcount ourselves. */
	v = htobe64(initsz);
	if (pwrite(fd, &v, 8, refoff) != 8)
		goto error;
	for (i = 0; i < initsz/clustersz + 1; i++) {
		refs = htobe16(1);
		if (pwrite(fd, &refs, 2, initsz + 2*i) != 2)
			goto error;
	}

	ret = close(fd);
	return (ret);
error:
	ret = errno;
	close(fd);
	unlink(imgfile_path);
	return (ret);
}
