/* vioqcow2.c revision 1.13 */
1/* $OpenBSD: vioqcow2.c,v 1.13 2019/01/10 19:21:02 deraadt Exp $ */ 2 3/* 4 * Copyright (c) 2018 Ori Bernstein <ori@eigenstate.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 19#include <sys/types.h> 20#include <sys/stat.h> 21 22#include <machine/vmmvar.h> 23#include <dev/pci/pcireg.h> 24 25#include <stdlib.h> 26#include <string.h> 27#include <unistd.h> 28#include <fcntl.h> 29#include <assert.h> 30#include <libgen.h> 31#include <err.h> 32#include <errno.h> 33 34#include "vmd.h" 35#include "vmm.h" 36#include "virtio.h" 37 38#define QCOW2_COMPRESSED 0x4000000000000000ull 39#define QCOW2_INPLACE 0x8000000000000000ull 40 41#define QCOW2_DIRTY (1 << 0) 42#define QCOW2_CORRUPT (1 << 1) 43 44enum { 45 ICFEATURE_DIRTY = 1 << 0, 46 ICFEATURE_CORRUPT = 1 << 1, 47}; 48 49enum { 50 ACFEATURE_BITEXT = 1 << 0, 51}; 52 53struct qcheader { 54 char magic[4]; 55 uint32_t version; 56 uint64_t backingoff; 57 uint32_t backingsz; 58 uint32_t clustershift; 59 uint64_t disksz; 60 uint32_t cryptmethod; 61 uint32_t l1sz; 62 uint64_t l1off; 63 uint64_t refoff; 64 uint32_t refsz; 65 uint32_t snapcount; 66 uint64_t snapsz; 67 /* v3 additions */ 68 uint64_t incompatfeatures; 69 uint64_t compatfeatures; 70 uint64_t autoclearfeatures; 71 uint32_t reforder; /* Bits = 1 << reforder */ 72 uint32_t 
headersz; 73} __packed; 74 75struct qcdisk { 76 pthread_rwlock_t lock; 77 struct qcdisk *base; 78 struct qcheader header; 79 80 int fd; 81 uint64_t *l1; 82 off_t end; 83 off_t clustersz; 84 off_t disksz; /* In bytes */ 85 uint32_t cryptmethod; 86 87 uint32_t l1sz; 88 off_t l1off; 89 90 off_t refoff; 91 off_t refsz; 92 93 uint32_t nsnap; 94 off_t snapoff; 95 96 /* v3 features */ 97 uint64_t incompatfeatures; 98 uint64_t autoclearfeatures; 99 uint32_t refssz; 100 uint32_t headersz; 101}; 102 103extern char *__progname; 104 105static off_t xlate(struct qcdisk *, off_t, int *); 106static void copy_cluster(struct qcdisk *, struct qcdisk *, off_t, off_t); 107static void inc_refs(struct qcdisk *, off_t, int); 108static off_t mkcluster(struct qcdisk *, struct qcdisk *, off_t, off_t); 109static int qc2_open(struct qcdisk *, int *, size_t); 110static ssize_t qc2_pread(void *, char *, size_t, off_t); 111static ssize_t qc2_pwrite(void *, char *, size_t, off_t); 112static void qc2_close(void *, int); 113 114/* 115 * Initializes a raw disk image backing file from an fd. 116 * Stores the number of 512 byte sectors in *szp, 117 * returning -1 for error, 0 for success. 118 * 119 * May open snapshot base images. 120 */ 121int 122virtio_qcow2_init(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd) 123{ 124 struct qcdisk *diskp; 125 126 diskp = malloc(sizeof(struct qcdisk)); 127 if (diskp == NULL) 128 return -1; 129 if (qc2_open(diskp, fd, nfd) == -1) { 130 log_warnx("could not open qcow2 disk"); 131 return -1; 132 } 133 file->p = diskp; 134 file->pread = qc2_pread; 135 file->pwrite = qc2_pwrite; 136 file->close = qc2_close; 137 *szp = diskp->disksz; 138 return 0; 139} 140 141/* 142 * Return the path to the base image given a disk image. 143 * Called from vmctl. 
144 */ 145ssize_t 146virtio_qcow2_get_base(int fd, char *path, size_t npath, const char *dpath) 147{ 148 char expanded[PATH_MAX]; 149 struct qcheader header; 150 uint64_t backingoff; 151 uint32_t backingsz; 152 char *s = NULL; 153 154 if (pread(fd, &header, sizeof(header), 0) != sizeof(header)) { 155 log_warnx("short read on header"); 156 return -1; 157 } 158 if (strncmp(header.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0) { 159 log_warnx("invalid magic numbers"); 160 return -1; 161 } 162 backingoff = be64toh(header.backingoff); 163 backingsz = be32toh(header.backingsz); 164 if (backingsz == 0) 165 return 0; 166 167 if (backingsz >= npath - 1) { 168 log_warnx("snapshot path too long"); 169 return -1; 170 } 171 if (pread(fd, path, backingsz, backingoff) != backingsz) { 172 log_warnx("could not read snapshot base name"); 173 return -1; 174 } 175 path[backingsz] = '\0'; 176 177 /* 178 * Relative paths should be interpreted relative to the disk image, 179 * rather than relative to the directory vmd happens to be running in, 180 * since this is the only userful interpretation. 
181 */ 182 if (path[0] == '/') { 183 if (realpath(path, expanded) == NULL || 184 strlcpy(path, expanded, npath) >= npath) { 185 log_warnx("unable to resolve %s", path); 186 return -1; 187 } 188 } else { 189 s = dirname(dpath); 190 if (snprintf(expanded, sizeof(expanded), 191 "%s/%s", s, path) >= (int)sizeof(expanded)) { 192 log_warnx("path too long: %s/%s", s, path); 193 return -1; 194 } 195 if (npath < PATH_MAX || 196 realpath(expanded, path) == NULL) { 197 log_warnx("unable to resolve %s", path); 198 return -1; 199 } 200 } 201 202 return strlen(path); 203} 204 205static int 206qc2_open(struct qcdisk *disk, int *fds, size_t nfd) 207{ 208 char basepath[PATH_MAX]; 209 struct stat st; 210 struct qcheader header; 211 uint64_t backingoff; 212 uint32_t backingsz; 213 off_t i; 214 int version, fd; 215 216 pthread_rwlock_init(&disk->lock, NULL); 217 fd = fds[0]; 218 disk->fd = fd; 219 disk->base = NULL; 220 disk->l1 = NULL; 221 222 if (pread(fd, &header, sizeof(header), 0) != sizeof(header)) 223 fatalx("short read on header"); 224 if (strncmp(header.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0) 225 fatalx("invalid magic numbers"); 226 227 disk->clustersz = (1ull << be32toh(header.clustershift)); 228 disk->disksz = be64toh(header.disksz); 229 disk->cryptmethod = be32toh(header.cryptmethod); 230 disk->l1sz = be32toh(header.l1sz); 231 disk->l1off = be64toh(header.l1off); 232 disk->refsz = be32toh(header.refsz); 233 disk->refoff = be64toh(header.refoff); 234 disk->nsnap = be32toh(header.snapcount); 235 disk->snapoff = be64toh(header.snapsz); 236 237 /* 238 * The additional features here are defined as 0 in the v2 format, 239 * so as long as we clear the buffer before parsing, we don't need 240 * to check versions here. 
241 */ 242 disk->incompatfeatures = be64toh(header.incompatfeatures); 243 disk->autoclearfeatures = be64toh(header.autoclearfeatures); 244 disk->refssz = be32toh(header.refsz); 245 disk->headersz = be32toh(header.headersz); 246 247 /* 248 * We only know about the dirty or corrupt bits here. 249 */ 250 if (disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT)) 251 fatalx("unsupported features %llx", 252 disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT)); 253 if (be32toh(header.reforder) != 4) 254 fatalx("unsupported refcount size\n"); 255 256 disk->l1 = calloc(disk->l1sz, sizeof(*disk->l1)); 257 if (!disk->l1) 258 fatal("%s: could not allocate l1 table", __func__); 259 if (pread(disk->fd, disk->l1, 8 * disk->l1sz, disk->l1off) 260 != 8 * disk->l1sz) 261 fatalx("%s: unable to read qcow2 L1 table", __func__); 262 for (i = 0; i < disk->l1sz; i++) 263 disk->l1[i] = be64toh(disk->l1[i]); 264 version = be32toh(header.version); 265 if (version != 2 && version != 3) 266 fatalx("%s: unknown qcow2 version %d", __func__, version); 267 268 backingoff = be64toh(header.backingoff); 269 backingsz = be32toh(header.backingsz); 270 if (backingsz != 0) { 271 if (backingsz >= sizeof(basepath) - 1) { 272 fatalx("%s: snapshot path too long", __func__); 273 } 274 if (pread(fd, basepath, backingsz, backingoff) != backingsz) { 275 fatalx("%s: could not read snapshot base name", 276 __func__); 277 } 278 basepath[backingsz] = 0; 279 if (nfd <= 1) { 280 fatalx("%s: missing base image %s", __func__, 281 basepath); 282 } 283 284 285 disk->base = calloc(1, sizeof(struct qcdisk)); 286 if (!disk->base) 287 fatal("%s: could not open %s", __func__, basepath); 288 if (qc2_open(disk->base, fds + 1, nfd - 1) == -1) 289 fatalx("%s: could not open %s", __func__, basepath); 290 if (disk->base->clustersz != disk->clustersz) 291 fatalx("%s: all disk parts must share clustersize", 292 __func__); 293 } 294 if (fstat(fd, &st) == -1) 295 fatal("%s: unable to stat disk", __func__); 296 297 disk->end = 
st.st_size; 298 299 log_debug("%s: qcow2 disk version %d size %lld end %lld snap %d", 300 __func__, version, disk->disksz, disk->end, disk->nsnap); 301 302 return 0; 303} 304 305static ssize_t 306qc2_pread(void *p, char *buf, size_t len, off_t off) 307{ 308 struct qcdisk *disk, *d; 309 off_t phys_off, end, cluster_off; 310 ssize_t sz, rem; 311 312 disk = p; 313 end = off + len; 314 if (off < 0 || end > disk->disksz) 315 return -1; 316 317 /* handle head chunk separately */ 318 rem = len; 319 while (off != end) { 320 for (d = disk; d; d = d->base) 321 if ((phys_off = xlate(d, off, NULL)) > 0) 322 break; 323 /* Break out into chunks. This handles 324 * three cases: 325 * 326 * |----+====|========|====+-----| 327 * 328 * Either we are at the start of the read, 329 * and the cluster has some leading bytes. 330 * This means that we are reading the tail 331 * of the cluster, and our size is: 332 * 333 * clustersz - (off % clustersz). 334 * 335 * Otherwise, we're reading the middle section. 336 * We're already aligned here, so we can just 337 * read the whole cluster size. Or we're at the 338 * tail, at which point we just want to read the 339 * remaining bytes. 340 */ 341 cluster_off = off % disk->clustersz; 342 sz = disk->clustersz - cluster_off; 343 if (sz > rem) 344 sz = rem; 345 /* 346 * If we're within the disk, but don't have backing bytes, 347 * just read back zeros. 
348 */ 349 if (!d) 350 bzero(buf, sz); 351 else if (pread(d->fd, buf, sz, phys_off) != sz) 352 return -1; 353 off += sz; 354 buf += sz; 355 rem -= sz; 356 } 357 return len; 358} 359 360ssize_t 361qc2_pwrite(void *p, char *buf, size_t len, off_t off) 362{ 363 struct qcdisk *disk, *d; 364 off_t phys_off, cluster_off, end; 365 ssize_t sz, rem; 366 int inplace; 367 368 d = p; 369 disk = p; 370 inplace = 1; 371 end = off + len; 372 if (off < 0 || end > disk->disksz) 373 return -1; 374 rem = len; 375 while (off != end) { 376 /* See the read code for a summary of the computation */ 377 cluster_off = off % disk->clustersz; 378 sz = disk->clustersz - cluster_off; 379 if (sz > rem) 380 sz = rem; 381 382 phys_off = xlate(disk, off, &inplace); 383 if (phys_off == -1) 384 return -1; 385 /* 386 * If we couldn't find the cluster in the writable disk, 387 * see if it exists in the base image. If it does, we 388 * need to copy it before the write. The copy happens 389 * in the '!inplace' if clause below te search. 390 */ 391 if (phys_off == 0) 392 for (d = disk->base; d; d = d->base) 393 if ((phys_off = xlate(d, off, NULL)) > 0) 394 break; 395 if (!inplace || phys_off == 0) 396 phys_off = mkcluster(disk, d, off, phys_off); 397 if (phys_off == -1) 398 return -1; 399 if (phys_off < disk->clustersz) 400 fatalx("%s: writing reserved cluster", __func__); 401 if (pwrite(disk->fd, buf, sz, phys_off) != sz) 402 return -1; 403 off += sz; 404 buf += sz; 405 rem -= sz; 406 } 407 return len; 408} 409 410static void 411qc2_close(void *p, int stayopen) 412{ 413 struct qcdisk *disk; 414 415 disk = p; 416 if (disk->base) 417 qc2_close(disk->base, stayopen); 418 if (!stayopen) 419 close(disk->fd); 420 free(disk->l1); 421 free(disk); 422} 423 424/* 425 * Translates a virtual offset into an on-disk offset. 
426 * Returns: 427 * -1 on error 428 * 0 on 'not found' 429 * >0 on found 430 */ 431static off_t 432xlate(struct qcdisk *disk, off_t off, int *inplace) 433{ 434 off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff; 435 uint64_t buf; 436 437 438 /* 439 * Clear out inplace flag -- xlate misses should not 440 * be flagged as updatable in place. We will still 441 * return 0 from them, but this leaves less surprises 442 * in the API. 443 */ 444 if (inplace) 445 *inplace = 0; 446 pthread_rwlock_rdlock(&disk->lock); 447 if (off < 0) 448 goto err; 449 450 l2sz = disk->clustersz / 8; 451 l1off = (off / disk->clustersz) / l2sz; 452 if (l1off >= disk->l1sz) 453 goto err; 454 455 l2tab = disk->l1[l1off]; 456 l2tab &= ~QCOW2_INPLACE; 457 if (l2tab == 0) { 458 pthread_rwlock_unlock(&disk->lock); 459 return 0; 460 } 461 l2off = (off / disk->clustersz) % l2sz; 462 pread(disk->fd, &buf, sizeof(buf), l2tab + l2off * 8); 463 cluster = be64toh(buf); 464 /* 465 * cluster may be 0, but all future operations don't affect 466 * the return value. 467 */ 468 if (inplace) 469 *inplace = !!(cluster & QCOW2_INPLACE); 470 if (cluster & QCOW2_COMPRESSED) 471 fatalx("%s: compressed clusters unsupported", __func__); 472 pthread_rwlock_unlock(&disk->lock); 473 clusteroff = 0; 474 cluster &= ~QCOW2_INPLACE; 475 if (cluster) 476 clusteroff = off % disk->clustersz; 477 return cluster + clusteroff; 478err: 479 pthread_rwlock_unlock(&disk->lock); 480 return -1; 481} 482 483/* 484 * Allocates a new cluster on disk, creating a new L2 table 485 * if needed. The cluster starts off with a refs of one, 486 * and the writable bit set. 487 * 488 * Returns -1 on error, and the physical address within the 489 * cluster of the write offset if it exists. 
490 */ 491static off_t 492mkcluster(struct qcdisk *disk, struct qcdisk *base, off_t off, off_t src_phys) 493{ 494 off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff, orig; 495 uint64_t buf; 496 int fd; 497 498 pthread_rwlock_wrlock(&disk->lock); 499 500 cluster = -1; 501 fd = disk->fd; 502 /* L1 entries always exist */ 503 l2sz = disk->clustersz / 8; 504 l1off = off / (disk->clustersz * l2sz); 505 if (l1off >= disk->l1sz) 506 fatalx("l1 offset outside disk"); 507 508 disk->end = (disk->end + disk->clustersz - 1) & ~(disk->clustersz - 1); 509 510 l2tab = disk->l1[l1off]; 511 l2off = (off / disk->clustersz) % l2sz; 512 /* We may need to create or clone an L2 entry to map the block */ 513 if (l2tab == 0 || (l2tab & QCOW2_INPLACE) == 0) { 514 orig = l2tab & ~QCOW2_INPLACE; 515 l2tab = disk->end; 516 disk->end += disk->clustersz; 517 if (ftruncate(disk->fd, disk->end) == -1) 518 fatal("%s: ftruncate failed", __func__); 519 520 /* 521 * If we translated, found a L2 entry, but it needed to 522 * be copied, copy it. 
523 */ 524 if (orig != 0) 525 copy_cluster(disk, disk, l2tab, orig); 526 /* Update l1 -- we flush it later */ 527 disk->l1[l1off] = l2tab | QCOW2_INPLACE; 528 inc_refs(disk, l2tab, 1); 529 } 530 l2tab &= ~QCOW2_INPLACE; 531 532 /* Grow the disk */ 533 if (ftruncate(disk->fd, disk->end + disk->clustersz) < 0) 534 fatalx("%s: could not grow disk", __func__); 535 if (src_phys > 0) 536 copy_cluster(disk, base, disk->end, src_phys); 537 cluster = disk->end; 538 disk->end += disk->clustersz; 539 buf = htobe64(cluster | QCOW2_INPLACE); 540 if (pwrite(disk->fd, &buf, sizeof(buf), l2tab + l2off * 8) != 8) 541 fatalx("%s: could not write cluster", __func__); 542 543 /* TODO: lazily sync: currently VMD doesn't close things */ 544 buf = htobe64(disk->l1[l1off]); 545 if (pwrite(disk->fd, &buf, sizeof(buf), disk->l1off + 8 * l1off) != 8) 546 fatalx("%s: could not write l1", __func__); 547 inc_refs(disk, cluster, 1); 548 549 pthread_rwlock_unlock(&disk->lock); 550 clusteroff = off % disk->clustersz; 551 if (cluster + clusteroff < disk->clustersz) 552 fatalx("write would clobber header"); 553 return cluster + clusteroff; 554} 555 556/* Copies a cluster containing src to dst. Src and dst need not be aligned. 
*/ 557static void 558copy_cluster(struct qcdisk *disk, struct qcdisk *base, off_t dst, off_t src) 559{ 560 char *scratch; 561 562 scratch = malloc(disk->clustersz); 563 if (!scratch) 564 fatal("out of memory"); 565 src &= ~(disk->clustersz - 1); 566 dst &= ~(disk->clustersz - 1); 567 if (pread(base->fd, scratch, disk->clustersz, src) == -1) 568 fatal("%s: could not read cluster", __func__); 569 if (pwrite(disk->fd, scratch, disk->clustersz, dst) == -1) 570 fatal("%s: could not write cluster", __func__); 571 free(scratch); 572} 573 574static void 575inc_refs(struct qcdisk *disk, off_t off, int newcluster) 576{ 577 off_t l1off, l1idx, l2idx, l2cluster; 578 size_t nper; 579 uint16_t refs; 580 uint64_t buf; 581 582 off &= ~QCOW2_INPLACE; 583 nper = disk->clustersz / 2; 584 l1idx = (off / disk->clustersz) / nper; 585 l2idx = (off / disk->clustersz) % nper; 586 l1off = disk->refoff + 8 * l1idx; 587 if (pread(disk->fd, &buf, sizeof(buf), l1off) != 8) 588 fatal("could not read refs"); 589 590 l2cluster = be64toh(buf); 591 if (l2cluster == 0) { 592 l2cluster = disk->end; 593 disk->end += disk->clustersz; 594 if (ftruncate(disk->fd, disk->end) < 0) 595 fatal("%s: failed to allocate ref block", __func__); 596 buf = htobe64(l2cluster); 597 if (pwrite(disk->fd, &buf, sizeof(buf), l1off) != 8) 598 fatal("%s: failed to write ref block", __func__); 599 } 600 601 refs = 1; 602 if (!newcluster) { 603 if (pread(disk->fd, &refs, sizeof(refs), 604 l2cluster + 2 * l2idx) != 2) 605 fatal("could not read ref cluster"); 606 refs = be16toh(refs) + 1; 607 } 608 refs = htobe16(refs); 609 if (pwrite(disk->fd, &refs, sizeof(refs), l2cluster + 2 * l2idx) != 2) 610 fatal("%s: could not write ref block", __func__); 611} 612 613/* 614 * virtio_qcow2_create 615 * 616 * Create an empty qcow2 imagefile with the specified path and size. 
617 * 618 * Parameters: 619 * imgfile_path: path to the image file to create 620 * imgsize : size of the image file to create (in MB) 621 * 622 * Return: 623 * EEXIST: The requested image file already exists 624 * 0 : Image file successfully created 625 * Exxxx : Various other Exxxx errno codes due to other I/O errors 626 */ 627int 628virtio_qcow2_create(const char *imgfile_path, 629 const char *base_path, long imgsize) 630{ 631 struct qcheader { 632 char magic[4]; 633 uint32_t version; 634 uint64_t backingoff; 635 uint32_t backingsz; 636 uint32_t clustershift; 637 uint64_t disksz; 638 uint32_t cryptmethod; 639 uint32_t l1sz; 640 uint64_t l1off; 641 uint64_t refoff; 642 uint32_t refsz; 643 uint32_t snapcount; 644 uint64_t snapsz; 645 /* v3 additions */ 646 uint64_t incompatfeatures; 647 uint64_t compatfeatures; 648 uint64_t autoclearfeatures; 649 uint32_t reforder; 650 uint32_t headersz; 651 } __packed hdr, basehdr; 652 int fd, ret; 653 ssize_t base_len; 654 uint64_t l1sz, refsz, disksz, initsz, clustersz; 655 uint64_t l1off, refoff, v, i, l1entrysz, refentrysz; 656 uint16_t refs; 657 658 disksz = 1024 * 1024 * imgsize; 659 660 if (base_path) { 661 fd = open(base_path, O_RDONLY); 662 if (read(fd, &basehdr, sizeof(basehdr)) != sizeof(basehdr)) 663 err(1, "failure to read base image header"); 664 close(fd); 665 if (strncmp(basehdr.magic, 666 VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0) 667 errx(1, "base image is not a qcow2 file"); 668 if (!disksz) 669 disksz = betoh64(basehdr.disksz); 670 else if (disksz != betoh64(basehdr.disksz)) 671 errx(1, "base size does not match requested size"); 672 } 673 if (!base_path && !disksz) 674 errx(1, "missing disk size"); 675 676 clustersz = (1<<16); 677 l1off = ALIGNSZ(sizeof(hdr), clustersz); 678 679 l1entrysz = clustersz * clustersz / 8; 680 l1sz = (disksz + l1entrysz - 1) / l1entrysz; 681 682 refoff = ALIGNSZ(l1off + 8*l1sz, clustersz); 683 refentrysz = clustersz * clustersz * clustersz / 2; 684 refsz = (disksz + refentrysz - 1) 
/ refentrysz; 685 686 initsz = ALIGNSZ(refoff + refsz*clustersz, clustersz); 687 base_len = base_path ? strlen(base_path) : 0; 688 689 memcpy(hdr.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)); 690 hdr.version = htobe32(3); 691 hdr.backingoff = htobe64(base_path ? sizeof(hdr) : 0); 692 hdr.backingsz = htobe32(base_len); 693 hdr.clustershift = htobe32(16); 694 hdr.disksz = htobe64(disksz); 695 hdr.cryptmethod = htobe32(0); 696 hdr.l1sz = htobe32(l1sz); 697 hdr.l1off = htobe64(l1off); 698 hdr.refoff = htobe64(refoff); 699 hdr.refsz = htobe32(refsz); 700 hdr.snapcount = htobe32(0); 701 hdr.snapsz = htobe64(0); 702 hdr.incompatfeatures = htobe64(0); 703 hdr.compatfeatures = htobe64(0); 704 hdr.autoclearfeatures = htobe64(0); 705 hdr.reforder = htobe32(4); 706 hdr.headersz = htobe32(sizeof(hdr)); 707 708 /* Refuse to overwrite an existing image */ 709 fd = open(imgfile_path, O_RDWR | O_CREAT | O_TRUNC | O_EXCL, 710 S_IRUSR | S_IWUSR); 711 if (fd == -1) 712 return (errno); 713 714 /* Write out the header */ 715 if (write(fd, &hdr, sizeof(hdr)) != sizeof(hdr)) 716 goto error; 717 718 /* Add the base image */ 719 if (base_path && write(fd, base_path, base_len) != base_len) 720 goto error; 721 722 /* Extend to desired size, and add one refcount cluster */ 723 if (ftruncate(fd, (off_t)initsz + clustersz) == -1) 724 goto error; 725 726 /* 727 * Paranoia: if our disk image takes more than one cluster 728 * to refcount the initial image, fail. 729 */ 730 if (initsz/clustersz > clustersz/2) { 731 errno = ERANGE; 732 goto error; 733 } 734 735 /* Add a refcount block, and refcount ourselves. */ 736 v = htobe64(initsz); 737 if (pwrite(fd, &v, 8, refoff) != 8) 738 goto error; 739 for (i = 0; i < initsz/clustersz + 1; i++) { 740 refs = htobe16(1); 741 if (pwrite(fd, &refs, 2, initsz + 2*i) != 2) 742 goto error; 743 } 744 745 ret = close(fd); 746 return (ret); 747error: 748 ret = errno; 749 close(fd); 750 unlink(imgfile_path); 751 return (errno); 752} 753