vioqcow2.c revision 1.2
1/*	$OpenBSD: vioqcow2.c,v 1.2 2018/09/11 04:06:32 ccardenas Exp $	*/
2
3/*
4 * Copyright (c) 2018 Ori Bernstein <ori@eigenstate.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18
19#include <sys/types.h>
20#include <sys/stat.h>
21
22#include <machine/vmmvar.h>
23#include <dev/pci/pcireg.h>
24
25#include <stdlib.h>
26#include <string.h>
27#include <unistd.h>
28#include <fcntl.h>
29#include <assert.h>
30#include <err.h>
31
32#include "vmd.h"
33#include "vmm.h"
34#include "virtio.h"
35
/*
 * Flag bits carried in the top of 64-bit L1/L2 table entries.
 * xlate() refuses entries with QCOW2_COMPRESSED set and reports
 * QCOW2_INPLACE to its caller, which allocates a fresh cluster before
 * writing whenever the flag is clear.
 */
#define QCOW2_COMPRESSED	(1ull << 62)
#define QCOW2_INPLACE		(1ull << 63)

/* Incompatible-feature bits that qc2_open() tolerates in the header. */
#define QCOW2_DIRTY		(1 << 0)
#define QCOW2_CORRUPT		(1 << 1)

/* The same incompatible-feature bits, as enum constants. */
enum {
	ICFEATURE_DIRTY		= 1 << 0,
	ICFEATURE_CORRUPT	= 1 << 1,
};

/* Presumably bits of the header's autoclear feature word. */
enum {
	ACFEATURE_BITEXT	= 1 << 0,
};
50
/*
 * On-disk image header, read verbatim from offset 0 of the image.
 * Multi-byte fields are stored big-endian; qc2_open() converts them
 * with be32toh()/be64toh() before use.
 */
struct qcheader {
	char magic[4];			/* compared against "QFI\xfb" */
	uint32_t version;		/* 2 and 3 are accepted */
	uint64_t backingoff;		/* file offset of the base image path */
	uint32_t backingsz;		/* length of that path; 0 if none */
	uint32_t clustershift;		/* cluster size is 1 << clustershift */
	uint64_t disksz;		/* virtual disk size in bytes */
	uint32_t cryptmethod;
	uint32_t l1sz;			/* number of 8-byte L1 entries */
	uint64_t l1off;			/* file offset of the L1 table */
	uint64_t refoff;		/* file offset of the refcount table */
	uint32_t refsz;
	uint32_t snapcount;
	uint64_t snapsz;		/* loaded into qcdisk.snapoff */
	/* v3 additions */
	uint64_t incompatfeatures;
	uint64_t compatfeatures;
	uint64_t autoclearfeatures;
	uint32_t reforder;		/* Bits = 1 << reforder */
	uint32_t headersz;
} __packed;
72
73struct qcdisk {
74	pthread_rwlock_t lock;
75	struct qcdisk *base;
76	struct qcheader header;
77
78	int       fd;
79	uint64_t *l1;
80	char     *scratch;
81	off_t     end;
82	uint32_t  clustersz;
83	off_t	  disksz; /* In bytes */
84	uint32_t cryptmethod;
85
86	uint32_t l1sz;
87	off_t	 l1off;
88
89	off_t	 refoff;
90	uint32_t refsz;
91
92	uint32_t nsnap;
93	off_t	 snapoff;
94
95	/* v3 features */
96	uint64_t incompatfeatures;
97	uint64_t autoclearfeatures;
98	uint32_t refssz;
99	uint32_t headersz;
100};
101
extern char *__progname;

/* Offset translation and cluster allocation. */
static off_t xlate(struct qcdisk *disk, off_t off, int *inplace);
static int copy_cluster(struct qcdisk *disk, struct qcdisk *base,
    off_t dst, off_t src);
static off_t mkcluster(struct qcdisk *disk, struct qcdisk *base,
    off_t off, off_t src_phys);
static int inc_refs(struct qcdisk *disk, off_t off, int newcluster);

/* Image setup. */
static int qc2_openpath(struct qcdisk *disk, char *path, int flags);
static int qc2_open(struct qcdisk *disk, int fd);

/* Callbacks installed into struct virtio_backing. */
static ssize_t qc2_pread(void *p, char *buf, size_t len, off_t off);
static ssize_t qc2_pwrite(void *p, char *buf, size_t len, off_t off);
static void qc2_close(void *p);
113
114/*
115 * Initializes a raw disk image backing file from an fd.
116 * Stores the number of 512 byte sectors in *szp,
117 * returning -1 for error, 0 for success.
118 *
119 * May open snapshot base images.
120 */
121int
122virtio_init_qcow2(struct virtio_backing *file, off_t *szp, int fd)
123{
124	struct qcdisk *diskp;
125
126	diskp = malloc(sizeof(struct qcdisk));
127	if (diskp == NULL)
128		return -1;
129	if (qc2_open(diskp, fd) == -1) {
130		log_warnx("%s: could not open qcow2 disk", __func__);
131		free(diskp);
132		return -1;
133	}
134	file->p = diskp;
135	file->pread = qc2_pread;
136	file->pwrite = qc2_pwrite;
137	file->close = qc2_close;
138	*szp = diskp->disksz;
139	return 0;
140}
141
/*
 * Opens the image at path with the given open(2) flags and parses it
 * into *disk.  Returns 0 on success, -1 on error.
 *
 * The descriptor is created here, so it is closed again if the image
 * cannot be parsed; on success it belongs to *disk.
 */
static int
qc2_openpath(struct qcdisk *disk, char *path, int flags)
{
	int fd;

	fd = open(path, flags);
	if (fd < 0)
		return -1;
	if (qc2_open(disk, fd) == -1) {
		close(fd);
		return -1;
	}
	return 0;
}
152
153static int
154qc2_open(struct qcdisk *disk, int fd)
155{
156	char basepath[PATH_MAX];
157	struct stat st;
158	struct qcheader header;
159	uint64_t backingoff;
160	uint32_t backingsz;
161	size_t i;
162	int version;
163
164	if (pread(fd, &header, sizeof header, 0) != sizeof header) {
165		log_warn("%s: short read on header", __func__);
166		return -1;
167	}
168	if (strncmp(header.magic, "QFI\xfb", 4) != 0) {
169		log_warn("%s: invalid magic numbers", __func__);
170		return -1;
171	}
172	pthread_rwlock_init(&disk->lock, NULL);
173	disk->fd = fd;
174	disk->base = NULL;
175
176	disk->clustersz		= (1ull << be32toh(header.clustershift));
177	disk->disksz		= be64toh(header.disksz);
178	disk->cryptmethod	= be32toh(header.cryptmethod);
179	disk->l1sz		= be32toh(header.l1sz);
180	disk->l1off		= be64toh(header.l1off);
181	disk->refsz		= be32toh(header.refsz);
182	disk->refoff		= be64toh(header.refoff);
183	disk->nsnap		= be32toh(header.snapcount);
184	disk->snapoff		= be64toh(header.snapsz);
185	/*
186	 * The additional features here are defined as 0 in the v2 format,
187	 * so as long as we clear the buffer before parsing, we don't need
188	 * to check versions here.
189	 */
190	disk->incompatfeatures = be64toh(header.incompatfeatures);
191	disk->autoclearfeatures = be64toh(header.autoclearfeatures);
192	disk->refssz = be32toh(header.refsz);
193	disk->headersz = be32toh(header.headersz);
194
195	/*
196	 * We only know about the dirty or corrupt bits here.
197	 */
198	if (disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT)) {
199		log_warn("%s: unsupported features %llx", __func__,
200		    disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT));
201		return -1;
202	}
203
204	disk->l1 = calloc(disk->l1sz, sizeof *disk->l1);
205	if (pread(disk->fd, (char*)disk->l1, 8*disk->l1sz, disk->l1off)
206	    != 8*disk->l1sz) {
207		free(disk->l1);
208		return -1;
209	}
210	for (i = 0; i < disk->l1sz; i++)
211		disk->l1[i] = be64toh(disk->l1[i]);
212	version = be32toh(header.version);
213	if (version != 2 && version != 3) {
214		log_warn("%s: unknown qcow2 version %d", __func__, version);
215		return -1;
216	}
217
218	backingoff = be64toh(header.backingoff);
219	backingsz = be32toh(header.backingsz);
220	if (backingsz != 0) {
221		/*
222		 * FIXME: we need to figure out a way of opening these things,
223		 * otherwise we just crash with a pledge violation.
224		 */
225		log_warn("%s: unsupported external snapshot images", __func__);
226		return -1;
227
228		if (backingsz >= sizeof basepath - 1) {
229			log_warn("%s: snapshot path too long", __func__);
230			return -1;
231		}
232		if (pread(fd, basepath, backingsz, backingoff) != backingsz) {
233			log_warn("%s: could not read snapshot base name",
234			    __func__);
235			return -1;
236		}
237		basepath[backingsz] = 0;
238
239		disk->base = calloc(1, sizeof(struct qcdisk));
240		if (qc2_openpath(disk->base, basepath, O_RDONLY) == -1) {
241			free(disk->base);
242			return -1;
243		}
244		if (disk->base->clustersz != disk->clustersz) {
245			log_warn("%s: all disks must share clustersize",
246			    __func__);
247			free(disk->base);
248			return -1;
249		}
250	}
251	fstat(fd, &st);
252	disk->end = st.st_size;
253	return 0;
254}
255
256static ssize_t
257qc2_pread(void *p, char *buf, size_t len, off_t off)
258{
259	struct qcdisk *disk, *d;
260	off_t phys_off, end, cluster_off;
261	ssize_t sz, rem;
262
263	disk = p;
264	end = off + len;
265	if (off < 0 || end > disk->disksz)
266		return -1;
267
268	/* handle head chunk separately */
269	rem = len;
270	while (off != end) {
271		for (d = disk; d; d = d->base)
272			if ((phys_off = xlate(d, off, NULL)) > 0)
273				break;
274		/* Break out into chunks. This handles
275		 * three cases:
276		 *
277		 *    |----+====|========|====+    |
278		 *
279		 * Either we are at the start of the read,
280		 * and the cluster has some leading bytes.
281		 * This means that we are reading the tail
282		 * of the cluster, and our size is:
283		 *
284		 * 	clustersz - (off % clustersz).
285		 *
286		 * Otherwise, we're reading the middle section.
287		 * We're already aligned here, so we can just
288		 * read the whole cluster size. Or we're at the
289		 * tail, at which point we just want to read the
290		 * remaining bytes.
291		 */
292		cluster_off = off % disk->clustersz;
293		sz = disk->clustersz - cluster_off;
294		if (sz > rem)
295			sz = rem;
296		/*
297		 * If we're within the disk, but don't have backing bytes,
298		 * just read back zeros.
299		 */
300		if (!d)
301			bzero(buf, sz);
302		else if (pread(d->fd, buf, sz, phys_off) != sz)
303			return -1;
304		off += sz;
305		buf += sz;
306		rem -= sz;
307	}
308	return len;
309}
310
311ssize_t
312qc2_pwrite(void *p, char *buf, size_t len, off_t off)
313{
314	struct qcdisk *disk, *d;
315	off_t phys_off, cluster_off, end;
316	ssize_t sz, rem;
317	int inplace;
318
319	d = p;
320	disk = p;
321	inplace = 1;
322	end = off + len;
323	if (off < 0 || end > disk->disksz)
324		return -1;
325	rem = len;
326	while (off != end) {
327		/* See the read code for a summary of the computation */
328		cluster_off = off % disk->clustersz;
329		sz = disk->clustersz - cluster_off;
330		if (sz > rem)
331			sz = rem;
332
333		phys_off = xlate(disk, off, &inplace);
334		if (phys_off == -1)
335			return -1;
336		/*
337		 * If we couldn't find the cluster in the writable disk,
338		 * see if it exists in the base image. If it does, we
339		 * need to copy it before the write. The copy happens
340		 * in the '!inplace' if clause below te search.
341		 */
342		if (phys_off == 0)
343			for (d = disk->base; d; d = d->base)
344				if ((phys_off = xlate(d, off, NULL)) > 0)
345					break;
346		if (!inplace || phys_off == 0)
347			phys_off = mkcluster(disk, d, off, phys_off);
348		if (phys_off == -1)
349			return -1;
350		if (pwrite(disk->fd, buf, sz, phys_off) != sz)
351			return -1;
352		off += sz;
353		buf += sz;
354		rem -= sz;
355	}
356	return len;
357}
358
359static void
360qc2_close(void *p)
361{
362	struct qcdisk *disk;
363
364	disk = p;
365	pwrite(disk->fd, disk->l1, disk->l1sz, disk->l1off);
366	close(disk->fd);
367	free(disk);
368}
369
370/*
371 * Translates a virtual offset into an on-disk offset.
372 * Returns:
373 * 	-1 on error
374 * 	 0 on 'not found'
375 * 	>0 on found
376 */
377static off_t
378xlate(struct qcdisk *disk, off_t off, int *inplace)
379{
380	off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff;
381	uint64_t buf;
382
383
384	/*
385	 * Clear out inplace flag -- xlate misses should not
386	 * be flagged as updatable in place. We will still
387	 * return 0 from them, but this leaves less surprises
388	 * in the API.
389	 */
390	if (inplace)
391		*inplace = 0;
392	pthread_rwlock_rdlock(&disk->lock);
393	if (off < 0)
394		goto err;
395
396	l2sz = disk->clustersz / 8;
397	l1off = (off / disk->clustersz) / l2sz;
398	if (l1off >= disk->l1sz)
399		goto err;
400
401	l2tab = disk->l1[l1off];
402	l2tab &= ~QCOW2_INPLACE;
403	if (l2tab == 0) {
404		pthread_rwlock_unlock(&disk->lock);
405		return 0;
406	}
407	l2off = (off / disk->clustersz) % l2sz;
408	pread(disk->fd, &buf, sizeof(buf), l2tab + l2off*8);
409	cluster = be64toh(buf);
410	/*
411	 * cluster may be 0, but all future operations don't affect
412	 * the return value.
413	 */
414	if (inplace)
415		*inplace = !!(cluster & QCOW2_INPLACE);
416	if (cluster & QCOW2_COMPRESSED) {
417		log_warn("%s: compressed clusters unsupported", __func__);
418		goto err;
419	}
420	pthread_rwlock_unlock(&disk->lock);
421	clusteroff = 0;
422	cluster &= ~QCOW2_INPLACE;
423	if (cluster)
424		clusteroff = off % disk->clustersz;
425	return cluster + clusteroff;
426err:
427	pthread_rwlock_unlock(&disk->lock);
428	return -1;
429}
430
431/*
432 * Allocates a new cluster on disk, creating a new L2 table
433 * if needed. The cluster starts off with a refs of one,
434 * and the writable bit set.
435 *
436 * Returns -1 on error, and the physical address within the
437 * cluster of the write offset if it exists.
438 */
439static off_t
440mkcluster(struct qcdisk *disk, struct qcdisk *base, off_t off, off_t src_phys)
441{
442	off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff, orig;
443	uint64_t buf;
444	int fd;
445
446	pthread_rwlock_wrlock(&disk->lock);
447
448	cluster = -1;
449	fd = disk->fd;
450	/* L1 entries always exist */
451	l2sz = disk->clustersz / 8;
452	l1off = off / (disk->clustersz * l2sz);
453	if (l1off >= disk->l1sz)
454		goto fail;
455
456	/*
457	 * Align disk to cluster size, for ftruncate: Not strictly
458	 * required, but it easier to eyeball buggy write offsets,
459	 * and helps performance a bit.
460	 */
461	disk->end = (disk->end + disk->clustersz - 1) & ~(disk->clustersz - 1);
462
463	l2tab = disk->l1[l1off];
464	l2off = (off / disk->clustersz) % l2sz;
465	/* We may need to create or clone an L2 entry to map the block */
466	if (l2tab == 0 || (l2tab & QCOW2_INPLACE) == 0) {
467		orig = l2tab & ~QCOW2_INPLACE;
468		l2tab = disk->end;
469		disk->end += disk->clustersz;
470		if (ftruncate(disk->fd, disk->end) == -1) {
471			perror("ftruncate");
472			goto fail;
473		}
474
475		/*
476		 * If we translated, found a L2 entry, but it needed to
477		 * be copied, copy it.
478		 */
479		if (orig != 0 && copy_cluster(disk, disk, l2tab, orig) == -1) {
480			perror("move cluster");
481			goto fail;
482		}
483		/* Update l1 -- we flush it later */
484		disk->l1[l1off] = l2tab | QCOW2_INPLACE;
485		if (inc_refs(disk, l2tab, 1) == -1) {
486			perror("refs");
487			goto fail;
488		}
489	}
490	l2tab &= ~QCOW2_INPLACE;
491
492	/* Grow the disk */
493	if (ftruncate(disk->fd, disk->end + disk->clustersz) < 0)
494		goto fail;
495	if (src_phys > 0)
496		if (copy_cluster(disk, base, disk->end, src_phys) == -1)
497			goto fail;
498	cluster = disk->end;
499	disk->end += disk->clustersz;
500	buf = htobe64(cluster | QCOW2_INPLACE);
501	if (pwrite(disk->fd, &buf, sizeof buf, l2tab + l2off*8) != sizeof(buf))
502		goto fail;
503
504	/* TODO: lazily sync: currently VMD doesn't close things */
505	buf = htobe64(disk->l1[l1off]);
506	if (pwrite(disk->fd, &buf, sizeof buf, disk->l1off + 8*l1off) != 8)
507		goto fail;
508	if (inc_refs(disk, cluster, 1) == -1)
509		goto fail;
510
511	pthread_rwlock_unlock(&disk->lock);
512	clusteroff = off % disk->clustersz;
513	return cluster + clusteroff;
514
515fail:
516	pthread_rwlock_unlock(&disk->lock);
517	return -1;
518}
519
520/* Copies a cluster containing src to dst. Src and dst need not be aligned. */
521static int
522copy_cluster(struct qcdisk *disk, struct qcdisk *base, off_t dst, off_t src)
523{
524	char *scratch;
525
526	scratch = alloca(disk->clustersz);
527	if (!scratch)
528		err(1, "out of memory");
529	src &= ~(disk->clustersz - 1);
530	dst &= ~(disk->clustersz - 1);
531	if (pread(base->fd, scratch, disk->clustersz, src) == -1)
532		return -1;
533	if (pwrite(disk->fd, scratch, disk->clustersz, dst) == -1)
534		return -1;
535	return 0;
536}
537
538static int
539inc_refs(struct qcdisk *disk, off_t off, int newcluster)
540{
541	off_t l1off, l1idx, l2idx, l2cluster;
542	size_t nper;
543	uint16_t refs;
544	uint64_t buf;
545
546	off &= ~QCOW2_INPLACE;
547	nper = disk->clustersz / 2;
548	l1idx = (off / disk->clustersz) / nper;
549	l2idx = (off / disk->clustersz) % nper;
550	l1off = disk->refoff + 8*l1idx;
551	if (pread(disk->fd, &buf, sizeof buf, l1off) != 8)
552		return -1;
553
554	l2cluster = be64toh(buf);
555	if (l2cluster == 0) {
556		l2cluster = disk->end;
557		disk->end += disk->clustersz;
558		if (ftruncate(disk->fd, disk->end) < 0) {
559			log_warn("%s: refs block grow fail", __func__);
560			return -1;
561		}
562		buf = htobe64(l2cluster);
563		if (pwrite(disk->fd, &buf, sizeof buf, l1off) != 8) {
564			return -1;
565		}
566	}
567
568	refs = 1;
569	if (!newcluster) {
570		if (pread(disk->fd, &refs, sizeof refs, l2cluster+2*l2idx) != 2)
571			return -1;
572		refs = be16toh(refs) + 1;
573	}
574	refs = htobe16(refs);
575	if (pwrite(disk->fd, &refs, sizeof refs, l2cluster + 2*l2idx) != 2) {
576		log_warn("%s: could not write ref block", __func__);
577		return -1;
578	}
579	return 0;
580}
581
582