vioqcow2.c revision 1.2
1/* $OpenBSD: vioqcow2.c,v 1.2 2018/09/11 04:06:32 ccardenas Exp $ */ 2 3/* 4 * Copyright (c) 2018 Ori Bernstein <ori@eigenstate.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 19#include <sys/types.h> 20#include <sys/stat.h> 21 22#include <machine/vmmvar.h> 23#include <dev/pci/pcireg.h> 24 25#include <stdlib.h> 26#include <string.h> 27#include <unistd.h> 28#include <fcntl.h> 29#include <assert.h> 30#include <err.h> 31 32#include "vmd.h" 33#include "vmm.h" 34#include "virtio.h" 35 36#define QCOW2_COMPRESSED 0x4000000000000000ull 37#define QCOW2_INPLACE 0x8000000000000000ull 38 39#define QCOW2_DIRTY (1 << 0) 40#define QCOW2_CORRUPT (1 << 1) 41 42enum { 43 ICFEATURE_DIRTY = 1 << 0, 44 ICFEATURE_CORRUPT = 1 << 1, 45}; 46 47enum { 48 ACFEATURE_BITEXT = 1 << 0, 49}; 50 51struct qcheader { 52 char magic[4]; 53 uint32_t version; 54 uint64_t backingoff; 55 uint32_t backingsz; 56 uint32_t clustershift; 57 uint64_t disksz; 58 uint32_t cryptmethod; 59 uint32_t l1sz; 60 uint64_t l1off; 61 uint64_t refoff; 62 uint32_t refsz; 63 uint32_t snapcount; 64 uint64_t snapsz; 65 /* v3 additions */ 66 uint64_t incompatfeatures; 67 uint64_t compatfeatures; 68 uint64_t autoclearfeatures; 69 uint32_t reforder; /* Bits = 1 << reforder */ 70 uint32_t headersz; 71} __packed; 72 73struct qcdisk { 74 pthread_rwlock_t lock; 75 struct qcdisk *base; 76 struct qcheader header; 77 78 int fd; 79 uint64_t *l1; 80 char *scratch; 81 off_t end; 82 uint32_t clustersz; 83 off_t disksz; /* In bytes */ 84 uint32_t cryptmethod; 85 86 uint32_t l1sz; 87 off_t l1off; 88 89 off_t refoff; 90 uint32_t refsz; 91 92 uint32_t nsnap; 93 off_t snapoff; 94 95 /* v3 features */ 96 uint64_t incompatfeatures; 97 uint64_t autoclearfeatures; 98 uint32_t refssz; 99 uint32_t headersz; 100}; 101 102extern char *__progname; 103 104static off_t xlate(struct qcdisk *, off_t, int *); 105static int copy_cluster(struct qcdisk *, struct qcdisk *, off_t, off_t); 106static off_t mkcluster(struct qcdisk *, struct qcdisk *, off_t, off_t); 107static int inc_refs(struct qcdisk *, off_t, int); 108static int qc2_openpath(struct qcdisk *, char *, int); 109static int qc2_open(struct qcdisk *, int); 110static ssize_t qc2_pread(void *, char *, size_t, off_t); 111static ssize_t qc2_pwrite(void *, char *, size_t, off_t); 112static void qc2_close(void *); 113 114/* 115 * Initializes a raw disk image backing file from an fd. 116 * Stores the number of 512 byte sectors in *szp, 117 * returning -1 for error, 0 for success. 118 * 119 * May open snapshot base images. 120 */ 121int 122virtio_init_qcow2(struct virtio_backing *file, off_t *szp, int fd) 123{ 124 struct qcdisk *diskp; 125 126 diskp = malloc(sizeof(struct qcdisk)); 127 if (diskp == NULL) 128 return -1; 129 if (qc2_open(diskp, fd) == -1) { 130 log_warnx("%s: could not open qcow2 disk", __func__); 131 free(diskp); 132 return -1; 133 } 134 file->p = diskp; 135 file->pread = qc2_pread; 136 file->pwrite = qc2_pwrite; 137 file->close = qc2_close; 138 *szp = diskp->disksz; 139 return 0; 140} 141 142static int 143qc2_openpath(struct qcdisk *disk, char *path, int flags) 144{ 145 int fd; 146 147 fd = open(path, flags); 148 if (fd < 0) 149 return -1; 150 return qc2_open(disk, fd); 151} 152 153static int 154qc2_open(struct qcdisk *disk, int fd) 155{ 156 char basepath[PATH_MAX]; 157 struct stat st; 158 struct qcheader header; 159 uint64_t backingoff; 160 uint32_t backingsz; 161 size_t i; 162 int version; 163 164 if (pread(fd, &header, sizeof header, 0) != sizeof header) { 165 log_warn("%s: short read on header", __func__); 166 return -1; 167 } 168 if (strncmp(header.magic, "QFI\xfb", 4) != 0) { 169 log_warn("%s: invalid magic numbers", __func__); 170 return -1; 171 } 172 pthread_rwlock_init(&disk->lock, NULL); 173 disk->fd = fd; 174 disk->base = NULL; 175 176 disk->clustersz = (1ull << be32toh(header.clustershift)); 177 disk->disksz = be64toh(header.disksz); 178 disk->cryptmethod = be32toh(header.cryptmethod); 179 disk->l1sz = be32toh(header.l1sz); 180 disk->l1off = be64toh(header.l1off); 181 disk->refsz = be32toh(header.refsz); 182 disk->refoff = be64toh(header.refoff); 183 disk->nsnap = be32toh(header.snapcount); 184 disk->snapoff = be64toh(header.snapsz); 185 /* 186 * The additional features here are defined as 0 in the v2 format, 187 * so as long as we clear the buffer before parsing, we don't need 188 * to check versions here. 189 */ 190 disk->incompatfeatures = be64toh(header.incompatfeatures); 191 disk->autoclearfeatures = be64toh(header.autoclearfeatures); 192 disk->refssz = be32toh(header.refsz); 193 disk->headersz = be32toh(header.headersz); 194 195 /* 196 * We only know about the dirty or corrupt bits here. 197 */ 198 if (disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT)) { 199 log_warn("%s: unsupported features %llx", __func__, 200 disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT)); 201 return -1; 202 } 203 204 disk->l1 = calloc(disk->l1sz, sizeof *disk->l1); 205 if (pread(disk->fd, (char*)disk->l1, 8*disk->l1sz, disk->l1off) 206 != 8*disk->l1sz) { 207 free(disk->l1); 208 return -1; 209 } 210 for (i = 0; i < disk->l1sz; i++) 211 disk->l1[i] = be64toh(disk->l1[i]); 212 version = be32toh(header.version); 213 if (version != 2 && version != 3) { 214 log_warn("%s: unknown qcow2 version %d", __func__, version); 215 return -1; 216 } 217 218 backingoff = be64toh(header.backingoff); 219 backingsz = be32toh(header.backingsz); 220 if (backingsz != 0) { 221 /* 222 * FIXME: we need to figure out a way of opening these things, 223 * otherwise we just crash with a pledge violation. 224 */ 225 log_warn("%s: unsupported external snapshot images", __func__); 226 return -1; 227 228 if (backingsz >= sizeof basepath - 1) { 229 log_warn("%s: snapshot path too long", __func__); 230 return -1; 231 } 232 if (pread(fd, basepath, backingsz, backingoff) != backingsz) { 233 log_warn("%s: could not read snapshot base name", 234 __func__); 235 return -1; 236 } 237 basepath[backingsz] = 0; 238 239 disk->base = calloc(1, sizeof(struct qcdisk)); 240 if (qc2_openpath(disk->base, basepath, O_RDONLY) == -1) { 241 free(disk->base); 242 return -1; 243 } 244 if (disk->base->clustersz != disk->clustersz) { 245 log_warn("%s: all disks must share clustersize", 246 __func__); 247 free(disk->base); 248 return -1; 249 } 250 } 251 fstat(fd, &st); 252 disk->end = st.st_size; 253 return 0; 254} 255 256static ssize_t 257qc2_pread(void *p, char *buf, size_t len, off_t off) 258{ 259 struct qcdisk *disk, *d; 260 off_t phys_off, end, cluster_off; 261 ssize_t sz, rem; 262 263 disk = p; 264 end = off + len; 265 if (off < 0 || end > disk->disksz) 266 return -1; 267 268 /* handle head chunk separately */ 269 rem = len; 270 while (off != end) { 271 for (d = disk; d; d = d->base) 272 if ((phys_off = xlate(d, off, NULL)) > 0) 273 break; 274 /* Break out into chunks. This handles 275 * three cases: 276 * 277 * |----+====|========|====+ | 278 * 279 * Either we are at the start of the read, 280 * and the cluster has some leading bytes. 281 * This means that we are reading the tail 282 * of the cluster, and our size is: 283 * 284 * clustersz - (off % clustersz). 285 * 286 * Otherwise, we're reading the middle section. 287 * We're already aligned here, so we can just 288 * read the whole cluster size. Or we're at the 289 * tail, at which point we just want to read the 290 * remaining bytes. 291 */ 292 cluster_off = off % disk->clustersz; 293 sz = disk->clustersz - cluster_off; 294 if (sz > rem) 295 sz = rem; 296 /* 297 * If we're within the disk, but don't have backing bytes, 298 * just read back zeros. 299 */ 300 if (!d) 301 bzero(buf, sz); 302 else if (pread(d->fd, buf, sz, phys_off) != sz) 303 return -1; 304 off += sz; 305 buf += sz; 306 rem -= sz; 307 } 308 return len; 309} 310 311ssize_t 312qc2_pwrite(void *p, char *buf, size_t len, off_t off) 313{ 314 struct qcdisk *disk, *d; 315 off_t phys_off, cluster_off, end; 316 ssize_t sz, rem; 317 int inplace; 318 319 d = p; 320 disk = p; 321 inplace = 1; 322 end = off + len; 323 if (off < 0 || end > disk->disksz) 324 return -1; 325 rem = len; 326 while (off != end) { 327 /* See the read code for a summary of the computation */ 328 cluster_off = off % disk->clustersz; 329 sz = disk->clustersz - cluster_off; 330 if (sz > rem) 331 sz = rem; 332 333 phys_off = xlate(disk, off, &inplace); 334 if (phys_off == -1) 335 return -1; 336 /* 337 * If we couldn't find the cluster in the writable disk, 338 * see if it exists in the base image. If it does, we 339 * need to copy it before the write. The copy happens 340 * in the '!inplace' if clause below te search. 341 */ 342 if (phys_off == 0) 343 for (d = disk->base; d; d = d->base) 344 if ((phys_off = xlate(d, off, NULL)) > 0) 345 break; 346 if (!inplace || phys_off == 0) 347 phys_off = mkcluster(disk, d, off, phys_off); 348 if (phys_off == -1) 349 return -1; 350 if (pwrite(disk->fd, buf, sz, phys_off) != sz) 351 return -1; 352 off += sz; 353 buf += sz; 354 rem -= sz; 355 } 356 return len; 357} 358 359static void 360qc2_close(void *p) 361{ 362 struct qcdisk *disk; 363 364 disk = p; 365 pwrite(disk->fd, disk->l1, disk->l1sz, disk->l1off); 366 close(disk->fd); 367 free(disk); 368} 369 370/* 371 * Translates a virtual offset into an on-disk offset. 372 * Returns: 373 * -1 on error 374 * 0 on 'not found' 375 * >0 on found 376 */ 377static off_t 378xlate(struct qcdisk *disk, off_t off, int *inplace) 379{ 380 off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff; 381 uint64_t buf; 382 383 384 /* 385 * Clear out inplace flag -- xlate misses should not 386 * be flagged as updatable in place. We will still 387 * return 0 from them, but this leaves less surprises 388 * in the API. 389 */ 390 if (inplace) 391 *inplace = 0; 392 pthread_rwlock_rdlock(&disk->lock); 393 if (off < 0) 394 goto err; 395 396 l2sz = disk->clustersz / 8; 397 l1off = (off / disk->clustersz) / l2sz; 398 if (l1off >= disk->l1sz) 399 goto err; 400 401 l2tab = disk->l1[l1off]; 402 l2tab &= ~QCOW2_INPLACE; 403 if (l2tab == 0) { 404 pthread_rwlock_unlock(&disk->lock); 405 return 0; 406 } 407 l2off = (off / disk->clustersz) % l2sz; 408 pread(disk->fd, &buf, sizeof(buf), l2tab + l2off*8); 409 cluster = be64toh(buf); 410 /* 411 * cluster may be 0, but all future operations don't affect 412 * the return value. 413 */ 414 if (inplace) 415 *inplace = !!(cluster & QCOW2_INPLACE); 416 if (cluster & QCOW2_COMPRESSED) { 417 log_warn("%s: compressed clusters unsupported", __func__); 418 goto err; 419 } 420 pthread_rwlock_unlock(&disk->lock); 421 clusteroff = 0; 422 cluster &= ~QCOW2_INPLACE; 423 if (cluster) 424 clusteroff = off % disk->clustersz; 425 return cluster + clusteroff; 426err: 427 pthread_rwlock_unlock(&disk->lock); 428 return -1; 429} 430 431/* 432 * Allocates a new cluster on disk, creating a new L2 table 433 * if needed. The cluster starts off with a refs of one, 434 * and the writable bit set. 435 * 436 * Returns -1 on error, and the physical address within the 437 * cluster of the write offset if it exists. 438 */ 439static off_t 440mkcluster(struct qcdisk *disk, struct qcdisk *base, off_t off, off_t src_phys) 441{ 442 off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff, orig; 443 uint64_t buf; 444 int fd; 445 446 pthread_rwlock_wrlock(&disk->lock); 447 448 cluster = -1; 449 fd = disk->fd; 450 /* L1 entries always exist */ 451 l2sz = disk->clustersz / 8; 452 l1off = off / (disk->clustersz * l2sz); 453 if (l1off >= disk->l1sz) 454 goto fail; 455 456 /* 457 * Align disk to cluster size, for ftruncate: Not strictly 458 * required, but it easier to eyeball buggy write offsets, 459 * and helps performance a bit. 460 */ 461 disk->end = (disk->end + disk->clustersz - 1) & ~(disk->clustersz - 1); 462 463 l2tab = disk->l1[l1off]; 464 l2off = (off / disk->clustersz) % l2sz; 465 /* We may need to create or clone an L2 entry to map the block */ 466 if (l2tab == 0 || (l2tab & QCOW2_INPLACE) == 0) { 467 orig = l2tab & ~QCOW2_INPLACE; 468 l2tab = disk->end; 469 disk->end += disk->clustersz; 470 if (ftruncate(disk->fd, disk->end) == -1) { 471 perror("ftruncate"); 472 goto fail; 473 } 474 475 /* 476 * If we translated, found a L2 entry, but it needed to 477 * be copied, copy it. 478 */ 479 if (orig != 0 && copy_cluster(disk, disk, l2tab, orig) == -1) { 480 perror("move cluster"); 481 goto fail; 482 } 483 /* Update l1 -- we flush it later */ 484 disk->l1[l1off] = l2tab | QCOW2_INPLACE; 485 if (inc_refs(disk, l2tab, 1) == -1) { 486 perror("refs"); 487 goto fail; 488 } 489 } 490 l2tab &= ~QCOW2_INPLACE; 491 492 /* Grow the disk */ 493 if (ftruncate(disk->fd, disk->end + disk->clustersz) < 0) 494 goto fail; 495 if (src_phys > 0) 496 if (copy_cluster(disk, base, disk->end, src_phys) == -1) 497 goto fail; 498 cluster = disk->end; 499 disk->end += disk->clustersz; 500 buf = htobe64(cluster | QCOW2_INPLACE); 501 if (pwrite(disk->fd, &buf, sizeof buf, l2tab + l2off*8) != sizeof(buf)) 502 goto fail; 503 504 /* TODO: lazily sync: currently VMD doesn't close things */ 505 buf = htobe64(disk->l1[l1off]); 506 if (pwrite(disk->fd, &buf, sizeof buf, disk->l1off + 8*l1off) != 8) 507 goto fail; 508 if (inc_refs(disk, cluster, 1) == -1) 509 goto fail; 510 511 pthread_rwlock_unlock(&disk->lock); 512 clusteroff = off % disk->clustersz; 513 return cluster + clusteroff; 514 515fail: 516 pthread_rwlock_unlock(&disk->lock); 517 return -1; 518} 519 520/* Copies a cluster containing src to dst. Src and dst need not be aligned. */ 521static int 522copy_cluster(struct qcdisk *disk, struct qcdisk *base, off_t dst, off_t src) 523{ 524 char *scratch; 525 526 scratch = alloca(disk->clustersz); 527 if (!scratch) 528 err(1, "out of memory"); 529 src &= ~(disk->clustersz - 1); 530 dst &= ~(disk->clustersz - 1); 531 if (pread(base->fd, scratch, disk->clustersz, src) == -1) 532 return -1; 533 if (pwrite(disk->fd, scratch, disk->clustersz, dst) == -1) 534 return -1; 535 return 0; 536} 537 538static int 539inc_refs(struct qcdisk *disk, off_t off, int newcluster) 540{ 541 off_t l1off, l1idx, l2idx, l2cluster; 542 size_t nper; 543 uint16_t refs; 544 uint64_t buf; 545 546 off &= ~QCOW2_INPLACE; 547 nper = disk->clustersz / 2; 548 l1idx = (off / disk->clustersz) / nper; 549 l2idx = (off / disk->clustersz) % nper; 550 l1off = disk->refoff + 8*l1idx; 551 if (pread(disk->fd, &buf, sizeof buf, l1off) != 8) 552 return -1; 553 554 l2cluster = be64toh(buf); 555 if (l2cluster == 0) { 556 l2cluster = disk->end; 557 disk->end += disk->clustersz; 558 if (ftruncate(disk->fd, disk->end) < 0) { 559 log_warn("%s: refs block grow fail", __func__); 560 return -1; 561 } 562 buf = htobe64(l2cluster); 563 if (pwrite(disk->fd, &buf, sizeof buf, l1off) != 8) { 564 return -1; 565 } 566 } 567 568 refs = 1; 569 if (!newcluster) { 570 if (pread(disk->fd, &refs, sizeof refs, l2cluster+2*l2idx) != 2) 571 return -1; 572 refs = be16toh(refs) + 1; 573 } 574 refs = htobe16(refs); 575 if (pwrite(disk->fd, &refs, sizeof refs, l2cluster + 2*l2idx) != 2) { 576 log_warn("%s: could not write ref block", __func__); 577 return -1; 578 } 579 return 0; 580} 581 582