1 /* $OpenBSD: vioqcow2.c,v 1.25 2024/09/26 01:45:13 jsg Exp $ */ 2 3 /* 4 * Copyright (c) 2018 Ori Bernstein <ori@eigenstate.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 19 #include <sys/types.h> 20 #include <sys/stat.h> 21 22 #include <err.h> 23 #include <errno.h> 24 #include <fcntl.h> 25 #include <libgen.h> 26 #include <stdlib.h> 27 #include <string.h> 28 #include <unistd.h> 29 30 #include "virtio.h" 31 32 #define QCOW2_COMPRESSED 0x4000000000000000ull 33 #define QCOW2_INPLACE 0x8000000000000000ull 34 35 #define QCOW2_DIRTY (1 << 0) 36 #define QCOW2_CORRUPT (1 << 1) 37 38 enum { 39 ICFEATURE_DIRTY = 1 << 0, 40 ICFEATURE_CORRUPT = 1 << 1, 41 }; 42 43 enum { 44 ACFEATURE_BITEXT = 1 << 0, 45 }; 46 47 struct qcheader { 48 char magic[4]; 49 uint32_t version; 50 uint64_t backingoff; 51 uint32_t backingsz; 52 uint32_t clustershift; 53 uint64_t disksz; 54 uint32_t cryptmethod; 55 uint32_t l1sz; 56 uint64_t l1off; 57 uint64_t refoff; 58 uint32_t refsz; 59 uint32_t snapcount; 60 uint64_t snapsz; 61 /* v3 additions */ 62 uint64_t incompatfeatures; 63 uint64_t compatfeatures; 64 uint64_t autoclearfeatures; 65 uint32_t reforder; /* Bits = 1 << reforder */ 66 uint32_t headersz; 67 } __packed; 68 69 struct qcdisk { 70 pthread_rwlock_t lock; 71 struct qcdisk *base; 72 struct qcheader header; 73 74 int fd; 75 uint64_t *l1; 76 off_t end; 77 off_t clustersz; 78 off_t disksz; /* In bytes */ 79 uint32_t cryptmethod; 80 81 uint32_t l1sz; 82 off_t l1off; 83 84 off_t refoff; 85 off_t refsz; 86 87 uint32_t nsnap; 88 off_t snapoff; 89 90 /* v3 features */ 91 uint64_t incompatfeatures; 92 uint64_t autoclearfeatures; 93 uint32_t refssz; 94 uint32_t headersz; 95 }; 96 97 extern char *__progname; 98 99 static off_t xlate(struct qcdisk *, off_t, int *); 100 static void copy_cluster(struct qcdisk *, struct qcdisk *, off_t, off_t); 101 static void inc_refs(struct qcdisk *, off_t, int); 102 static off_t mkcluster(struct qcdisk *, struct qcdisk *, off_t, off_t); 103 static int qc2_open(struct qcdisk *, int *, size_t); 104 static ssize_t qc2_pread(void *, char *, size_t, off_t); 105 static ssize_t qc2_preadv(void *, struct iovec *, int, off_t); 106 static ssize_t qc2_pwrite(void *, char *, size_t, off_t); 107 static ssize_t qc2_pwritev(void *, struct iovec *, int, off_t); 108 static void qc2_close(void *, int); 109 110 /* 111 * Initializes a raw disk image backing file from an fd. Stores the 112 * number of bytes in *szp, returning -1 for error, 0 for success. 113 * 114 * May open snapshot base images. 115 */ 116 int 117 virtio_qcow2_init(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd) 118 { 119 struct qcdisk *diskp; 120 121 diskp = malloc(sizeof(struct qcdisk)); 122 if (diskp == NULL) 123 return -1; 124 if (qc2_open(diskp, fd, nfd) == -1) { 125 log_warnx("could not open qcow2 disk"); 126 return -1; 127 } 128 file->p = diskp; 129 file->pread = qc2_pread; 130 file->preadv = qc2_preadv; 131 file->pwrite = qc2_pwrite; 132 file->pwritev = qc2_pwritev; 133 file->close = qc2_close; 134 *szp = diskp->disksz; 135 return 0; 136 } 137 138 /* 139 * Return the path to the base image given a disk image. 140 * Called from vmctl. 141 */ 142 ssize_t 143 virtio_qcow2_get_base(int fd, char *path, size_t npath, const char *dpath) 144 { 145 char dpathbuf[PATH_MAX]; 146 char expanded[PATH_MAX]; 147 struct qcheader header; 148 uint64_t backingoff; 149 uint32_t backingsz; 150 char *s = NULL; 151 152 if (pread(fd, &header, sizeof(header), 0) != sizeof(header)) { 153 log_warnx("short read on header"); 154 return -1; 155 } 156 if (strncmp(header.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0) { 157 log_warnx("invalid magic numbers"); 158 return -1; 159 } 160 backingoff = be64toh(header.backingoff); 161 backingsz = be32toh(header.backingsz); 162 if (backingsz == 0) 163 return 0; 164 165 if (backingsz >= npath - 1) { 166 log_warnx("snapshot path too long"); 167 return -1; 168 } 169 if (pread(fd, path, backingsz, backingoff) != backingsz) { 170 log_warnx("could not read snapshot base name"); 171 return -1; 172 } 173 path[backingsz] = '\0'; 174 175 /* 176 * Relative paths should be interpreted relative to the disk image, 177 * rather than relative to the directory vmd happens to be running in, 178 * since this is the only useful interpretation. 179 */ 180 if (path[0] == '/') { 181 if (realpath(path, expanded) == NULL || 182 strlcpy(path, expanded, npath) >= npath) { 183 log_warnx("unable to resolve %s", path); 184 return -1; 185 } 186 } else { 187 if (strlcpy(dpathbuf, dpath, sizeof(dpathbuf)) >= 188 sizeof(dpathbuf)) { 189 log_warnx("path too long: %s", dpath); 190 return -1; 191 } 192 s = dirname(dpathbuf); 193 if (snprintf(expanded, sizeof(expanded), 194 "%s/%s", s, path) >= (int)sizeof(expanded)) { 195 log_warnx("path too long: %s/%s", s, path); 196 return -1; 197 } 198 if (npath < PATH_MAX || 199 realpath(expanded, path) == NULL) { 200 log_warnx("unable to resolve %s", path); 201 return -1; 202 } 203 } 204 205 return strlen(path); 206 } 207 208 static int 209 qc2_open(struct qcdisk *disk, int *fds, size_t nfd) 210 { 211 char basepath[PATH_MAX]; 212 struct stat st; 213 struct qcheader header; 214 uint64_t backingoff; 215 uint32_t backingsz; 216 off_t i; 217 int version, fd; 218 219 pthread_rwlock_init(&disk->lock, NULL); 220 fd = fds[0]; 221 disk->fd = fd; 222 disk->base = NULL; 223 disk->l1 = NULL; 224 225 if (pread(fd, &header, sizeof(header), 0) != sizeof(header)) 226 fatalx("short read on header"); 227 if (strncmp(header.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0) 228 fatalx("invalid magic numbers"); 229 230 disk->clustersz = (1ull << be32toh(header.clustershift)); 231 disk->disksz = be64toh(header.disksz); 232 disk->cryptmethod = be32toh(header.cryptmethod); 233 disk->l1sz = be32toh(header.l1sz); 234 disk->l1off = be64toh(header.l1off); 235 disk->refsz = be32toh(header.refsz); 236 disk->refoff = be64toh(header.refoff); 237 disk->nsnap = be32toh(header.snapcount); 238 disk->snapoff = be64toh(header.snapsz); 239 240 /* 241 * The additional features here are defined as 0 in the v2 format, 242 * so as long as we clear the buffer before parsing, we don't need 243 * to check versions here. 244 */ 245 disk->incompatfeatures = be64toh(header.incompatfeatures); 246 disk->autoclearfeatures = be64toh(header.autoclearfeatures); 247 disk->refssz = be32toh(header.refsz); 248 disk->headersz = be32toh(header.headersz); 249 250 /* 251 * We only know about the dirty or corrupt bits here. 252 */ 253 if (disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT)) 254 fatalx("unsupported features %llx", 255 disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT)); 256 if (be32toh(header.reforder) != 4) 257 fatalx("unsupported refcount size\n"); 258 259 disk->l1 = calloc(disk->l1sz, sizeof(*disk->l1)); 260 if (!disk->l1) 261 fatal("%s: could not allocate l1 table", __func__); 262 if (pread(disk->fd, disk->l1, 8 * disk->l1sz, disk->l1off) 263 != 8 * disk->l1sz) 264 fatalx("%s: unable to read qcow2 L1 table", __func__); 265 for (i = 0; i < disk->l1sz; i++) 266 disk->l1[i] = be64toh(disk->l1[i]); 267 version = be32toh(header.version); 268 if (version != 2 && version != 3) 269 fatalx("%s: unknown qcow2 version %d", __func__, version); 270 271 backingoff = be64toh(header.backingoff); 272 backingsz = be32toh(header.backingsz); 273 if (backingsz != 0) { 274 if (backingsz >= sizeof(basepath) - 1) { 275 fatalx("%s: snapshot path too long", __func__); 276 } 277 if (pread(fd, basepath, backingsz, backingoff) != backingsz) { 278 fatalx("%s: could not read snapshot base name", 279 __func__); 280 } 281 basepath[backingsz] = 0; 282 if (nfd <= 1) { 283 fatalx("%s: missing base image %s", __func__, 284 basepath); 285 } 286 287 288 disk->base = calloc(1, sizeof(struct qcdisk)); 289 if (!disk->base) 290 fatal("%s: could not open %s", __func__, basepath); 291 if (qc2_open(disk->base, fds + 1, nfd - 1) == -1) 292 fatalx("%s: could not open %s", __func__, basepath); 293 if (disk->base->clustersz != disk->clustersz) 294 fatalx("%s: all disk parts must share clustersize", 295 __func__); 296 } 297 if (fstat(fd, &st) == -1) 298 fatal("%s: unable to stat disk", __func__); 299 300 disk->end = st.st_size; 301 302 log_debug("%s: qcow2 disk version %d size %lld end %lld snap %d", 303 __func__, version, disk->disksz, disk->end, disk->nsnap); 304 305 return 0; 306 } 307 308 static ssize_t 309 qc2_preadv(void *p, struct iovec *iov, int cnt, off_t offset) 310 { 311 int i; 312 off_t pos = offset; 313 ssize_t sz = 0, total = 0; 314 315 for (i = 0; i < cnt; i++, iov++) { 316 sz = qc2_pread(p, iov->iov_base, iov->iov_len, pos); 317 if (sz == -1) 318 return (sz); 319 total += sz; 320 pos += sz; 321 } 322 323 return (total); 324 } 325 326 static ssize_t 327 qc2_pread(void *p, char *buf, size_t len, off_t off) 328 { 329 struct qcdisk *disk, *d; 330 off_t phys_off, end, cluster_off; 331 ssize_t sz, rem; 332 333 disk = p; 334 end = off + len; 335 if (off < 0 || end > disk->disksz) 336 return -1; 337 338 /* handle head chunk separately */ 339 rem = len; 340 while (off != end) { 341 for (d = disk; d; d = d->base) 342 if ((phys_off = xlate(d, off, NULL)) > 0) 343 break; 344 /* Break out into chunks. This handles 345 * three cases: 346 * 347 * |----+====|========|====+-----| 348 * 349 * Either we are at the start of the read, 350 * and the cluster has some leading bytes. 351 * This means that we are reading the tail 352 * of the cluster, and our size is: 353 * 354 * clustersz - (off % clustersz). 355 * 356 * Otherwise, we're reading the middle section. 357 * We're already aligned here, so we can just 358 * read the whole cluster size. Or we're at the 359 * tail, at which point we just want to read the 360 * remaining bytes. 361 */ 362 cluster_off = off % disk->clustersz; 363 sz = disk->clustersz - cluster_off; 364 if (sz > rem) 365 sz = rem; 366 /* 367 * If we're within the disk, but don't have backing bytes, 368 * just read back zeros. 369 */ 370 if (!d) 371 bzero(buf, sz); 372 else if (pread(d->fd, buf, sz, phys_off) != sz) 373 return -1; 374 off += sz; 375 buf += sz; 376 rem -= sz; 377 } 378 return len; 379 } 380 381 static ssize_t 382 qc2_pwritev(void *p, struct iovec *iov, int cnt, off_t offset) 383 { 384 int i; 385 off_t pos = offset; 386 ssize_t sz = 0, total = 0; 387 388 for (i = 0; i < cnt; i++, iov++) { 389 sz = qc2_pwrite(p, iov->iov_base, iov->iov_len, pos); 390 if (sz == -1) 391 return (sz); 392 total += sz; 393 pos += sz; 394 } 395 396 return (total); 397 } 398 399 static ssize_t 400 qc2_pwrite(void *p, char *buf, size_t len, off_t off) 401 { 402 struct qcdisk *disk, *d; 403 off_t phys_off, cluster_off, end; 404 ssize_t sz, rem; 405 int inplace; 406 407 d = p; 408 disk = p; 409 inplace = 1; 410 end = off + len; 411 if (off < 0 || end > disk->disksz) 412 return -1; 413 rem = len; 414 while (off != end) { 415 /* See the read code for a summary of the computation */ 416 cluster_off = off % disk->clustersz; 417 sz = disk->clustersz - cluster_off; 418 if (sz > rem) 419 sz = rem; 420 421 phys_off = xlate(disk, off, &inplace); 422 if (phys_off == -1) 423 return -1; 424 /* 425 * If we couldn't find the cluster in the writable disk, 426 * see if it exists in the base image. If it does, we 427 * need to copy it before the write. The copy happens 428 * in the '!inplace' if clause below te search. 429 */ 430 if (phys_off == 0) 431 for (d = disk->base; d; d = d->base) 432 if ((phys_off = xlate(d, off, NULL)) > 0) 433 break; 434 if (!inplace || phys_off == 0) 435 phys_off = mkcluster(disk, d, off, phys_off); 436 if (phys_off == -1) 437 return -1; 438 if (phys_off < disk->clustersz) 439 fatalx("%s: writing reserved cluster", __func__); 440 if (pwrite(disk->fd, buf, sz, phys_off) != sz) 441 return -1; 442 off += sz; 443 buf += sz; 444 rem -= sz; 445 } 446 return len; 447 } 448 449 static void 450 qc2_close(void *p, int stayopen) 451 { 452 struct qcdisk *disk; 453 454 disk = p; 455 if (disk->base) 456 qc2_close(disk->base, stayopen); 457 if (!stayopen) 458 close(disk->fd); 459 free(disk->l1); 460 free(disk); 461 } 462 463 /* 464 * Translates a virtual offset into an on-disk offset. 465 * Returns: 466 * -1 on error 467 * 0 on 'not found' 468 * >0 on found 469 */ 470 static off_t 471 xlate(struct qcdisk *disk, off_t off, int *inplace) 472 { 473 off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff; 474 uint64_t buf; 475 476 477 /* 478 * Clear out inplace flag -- xlate misses should not 479 * be flagged as updatable in place. We will still 480 * return 0 from them, but this leaves less surprises 481 * in the API. 482 */ 483 if (inplace) 484 *inplace = 0; 485 pthread_rwlock_rdlock(&disk->lock); 486 if (off < 0) 487 goto err; 488 489 l2sz = disk->clustersz / 8; 490 l1off = (off / disk->clustersz) / l2sz; 491 if (l1off >= disk->l1sz) 492 goto err; 493 494 l2tab = disk->l1[l1off]; 495 l2tab &= ~QCOW2_INPLACE; 496 if (l2tab == 0) { 497 pthread_rwlock_unlock(&disk->lock); 498 return 0; 499 } 500 l2off = (off / disk->clustersz) % l2sz; 501 pread(disk->fd, &buf, sizeof(buf), l2tab + l2off * 8); 502 cluster = be64toh(buf); 503 /* 504 * cluster may be 0, but all future operations don't affect 505 * the return value. 506 */ 507 if (inplace) 508 *inplace = !!(cluster & QCOW2_INPLACE); 509 if (cluster & QCOW2_COMPRESSED) 510 fatalx("%s: compressed clusters unsupported", __func__); 511 pthread_rwlock_unlock(&disk->lock); 512 clusteroff = 0; 513 cluster &= ~QCOW2_INPLACE; 514 if (cluster) 515 clusteroff = off % disk->clustersz; 516 return cluster + clusteroff; 517 err: 518 pthread_rwlock_unlock(&disk->lock); 519 return -1; 520 } 521 522 /* 523 * Allocates a new cluster on disk, creating a new L2 table 524 * if needed. The cluster starts off with a refs of one, 525 * and the writable bit set. 526 * 527 * Returns -1 on error, and the physical address within the 528 * cluster of the write offset if it exists. 529 */ 530 static off_t 531 mkcluster(struct qcdisk *disk, struct qcdisk *base, off_t off, off_t src_phys) 532 { 533 off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff, orig; 534 uint64_t buf; 535 536 pthread_rwlock_wrlock(&disk->lock); 537 538 cluster = -1; 539 /* L1 entries always exist */ 540 l2sz = disk->clustersz / 8; 541 l1off = off / (disk->clustersz * l2sz); 542 if (l1off >= disk->l1sz) 543 fatalx("l1 offset outside disk"); 544 545 disk->end = (disk->end + disk->clustersz - 1) & ~(disk->clustersz - 1); 546 547 l2tab = disk->l1[l1off]; 548 l2off = (off / disk->clustersz) % l2sz; 549 /* We may need to create or clone an L2 entry to map the block */ 550 if (l2tab == 0 || (l2tab & QCOW2_INPLACE) == 0) { 551 orig = l2tab & ~QCOW2_INPLACE; 552 l2tab = disk->end; 553 disk->end += disk->clustersz; 554 if (ftruncate(disk->fd, disk->end) == -1) 555 fatal("%s: ftruncate failed", __func__); 556 557 /* 558 * If we translated, found a L2 entry, but it needed to 559 * be copied, copy it. 560 */ 561 if (orig != 0) 562 copy_cluster(disk, disk, l2tab, orig); 563 /* Update l1 -- we flush it later */ 564 disk->l1[l1off] = l2tab | QCOW2_INPLACE; 565 inc_refs(disk, l2tab, 1); 566 } 567 l2tab &= ~QCOW2_INPLACE; 568 569 /* Grow the disk */ 570 if (ftruncate(disk->fd, disk->end + disk->clustersz) < 0) 571 fatal("%s: could not grow disk", __func__); 572 if (src_phys > 0) 573 copy_cluster(disk, base, disk->end, src_phys); 574 cluster = disk->end; 575 disk->end += disk->clustersz; 576 buf = htobe64(cluster | QCOW2_INPLACE); 577 if (pwrite(disk->fd, &buf, sizeof(buf), l2tab + l2off * 8) != 8) 578 fatalx("%s: could not write cluster", __func__); 579 580 /* TODO: lazily sync: currently VMD doesn't close things */ 581 buf = htobe64(disk->l1[l1off]); 582 if (pwrite(disk->fd, &buf, sizeof(buf), disk->l1off + 8 * l1off) != 8) 583 fatalx("%s: could not write l1", __func__); 584 inc_refs(disk, cluster, 1); 585 586 pthread_rwlock_unlock(&disk->lock); 587 clusteroff = off % disk->clustersz; 588 if (cluster + clusteroff < disk->clustersz) 589 fatalx("write would clobber header"); 590 return cluster + clusteroff; 591 } 592 593 /* Copies a cluster containing src to dst. Src and dst need not be aligned. */ 594 static void 595 copy_cluster(struct qcdisk *disk, struct qcdisk *base, off_t dst, off_t src) 596 { 597 char *scratch; 598 599 scratch = malloc(disk->clustersz); 600 if (!scratch) 601 fatal("out of memory"); 602 src &= ~(disk->clustersz - 1); 603 dst &= ~(disk->clustersz - 1); 604 if (pread(base->fd, scratch, disk->clustersz, src) == -1) 605 fatal("%s: could not read cluster", __func__); 606 if (pwrite(disk->fd, scratch, disk->clustersz, dst) == -1) 607 fatal("%s: could not write cluster", __func__); 608 free(scratch); 609 } 610 611 static void 612 inc_refs(struct qcdisk *disk, off_t off, int newcluster) 613 { 614 off_t l1off, l1idx, l2idx, l2cluster; 615 size_t nper; 616 uint16_t refs; 617 uint64_t buf; 618 619 off &= ~QCOW2_INPLACE; 620 nper = disk->clustersz / 2; 621 l1idx = (off / disk->clustersz) / nper; 622 l2idx = (off / disk->clustersz) % nper; 623 l1off = disk->refoff + 8 * l1idx; 624 if (pread(disk->fd, &buf, sizeof(buf), l1off) != 8) 625 fatal("could not read refs"); 626 627 l2cluster = be64toh(buf); 628 if (l2cluster == 0) { 629 l2cluster = disk->end; 630 disk->end += disk->clustersz; 631 if (ftruncate(disk->fd, disk->end) < 0) 632 fatal("%s: failed to allocate ref block", __func__); 633 buf = htobe64(l2cluster); 634 if (pwrite(disk->fd, &buf, sizeof(buf), l1off) != 8) 635 fatal("%s: failed to write ref block", __func__); 636 } 637 638 refs = 1; 639 if (!newcluster) { 640 if (pread(disk->fd, &refs, sizeof(refs), 641 l2cluster + 2 * l2idx) != 2) 642 fatal("could not read ref cluster"); 643 refs = be16toh(refs) + 1; 644 } 645 refs = htobe16(refs); 646 if (pwrite(disk->fd, &refs, sizeof(refs), l2cluster + 2 * l2idx) != 2) 647 fatal("%s: could not write ref block", __func__); 648 } 649 650 /* 651 * virtio_qcow2_create 652 * 653 * Create an empty qcow2 imagefile with the specified path and size. 654 * 655 * Parameters: 656 * imgfile_path: path to the image file to create 657 * imgsize : size of the image file to create (in bytes) 658 * 659 * Return: 660 * EEXIST: The requested image file already exists 661 * 0 : Image file successfully created 662 * Exxxx : Various other Exxxx errno codes due to other I/O errors 663 */ 664 int 665 virtio_qcow2_create(const char *imgfile_path, 666 const char *base_path, uint64_t disksz) 667 { 668 struct qcheader hdr, basehdr; 669 int fd, ret; 670 ssize_t base_len; 671 uint64_t l1sz, refsz, initsz, clustersz; 672 uint64_t l1off, refoff, v, i, l1entrysz, refentrysz; 673 uint16_t refs; 674 675 if (base_path) { 676 fd = open(base_path, O_RDONLY); 677 if (read(fd, &basehdr, sizeof(basehdr)) != sizeof(basehdr)) 678 errx(1, "failure to read base image header"); 679 close(fd); 680 if (strncmp(basehdr.magic, 681 VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0) 682 errx(1, "base image is not a qcow2 file"); 683 if (!disksz) 684 disksz = betoh64(basehdr.disksz); 685 else if (disksz != betoh64(basehdr.disksz)) 686 errx(1, "base size does not match requested size"); 687 } 688 if (!base_path && !disksz) 689 errx(1, "missing disk size"); 690 691 clustersz = (1<<16); 692 l1off = ALIGNSZ(sizeof(hdr), clustersz); 693 694 l1entrysz = clustersz * clustersz / 8; 695 l1sz = (disksz + l1entrysz - 1) / l1entrysz; 696 697 refoff = ALIGNSZ(l1off + 8*l1sz, clustersz); 698 refentrysz = clustersz * clustersz * clustersz / 2; 699 refsz = (disksz + refentrysz - 1) / refentrysz; 700 701 initsz = ALIGNSZ(refoff + refsz*clustersz, clustersz); 702 base_len = base_path ? strlen(base_path) : 0; 703 704 memcpy(hdr.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)); 705 hdr.version = htobe32(3); 706 hdr.backingoff = htobe64(base_path ? sizeof(hdr) : 0); 707 hdr.backingsz = htobe32(base_len); 708 hdr.clustershift = htobe32(16); 709 hdr.disksz = htobe64(disksz); 710 hdr.cryptmethod = htobe32(0); 711 hdr.l1sz = htobe32(l1sz); 712 hdr.l1off = htobe64(l1off); 713 hdr.refoff = htobe64(refoff); 714 hdr.refsz = htobe32(refsz); 715 hdr.snapcount = htobe32(0); 716 hdr.snapsz = htobe64(0); 717 hdr.incompatfeatures = htobe64(0); 718 hdr.compatfeatures = htobe64(0); 719 hdr.autoclearfeatures = htobe64(0); 720 hdr.reforder = htobe32(4); 721 hdr.headersz = htobe32(sizeof(hdr)); 722 723 /* Refuse to overwrite an existing image */ 724 fd = open(imgfile_path, O_RDWR | O_CREAT | O_TRUNC | O_EXCL, 725 S_IRUSR | S_IWUSR); 726 if (fd == -1) 727 return (errno); 728 729 /* Write out the header */ 730 if (write(fd, &hdr, sizeof(hdr)) != sizeof(hdr)) 731 goto error; 732 733 /* Add the base image */ 734 if (base_path && write(fd, base_path, base_len) != base_len) 735 goto error; 736 737 /* Extend to desired size, and add one refcount cluster */ 738 if (ftruncate(fd, (off_t)initsz + clustersz) == -1) 739 goto error; 740 741 /* 742 * Paranoia: if our disk image takes more than one cluster 743 * to refcount the initial image, fail. 744 */ 745 if (initsz/clustersz > clustersz/2) { 746 errno = ERANGE; 747 goto error; 748 } 749 750 /* Add a refcount block, and refcount ourselves. */ 751 v = htobe64(initsz); 752 if (pwrite(fd, &v, 8, refoff) != 8) 753 goto error; 754 for (i = 0; i < initsz/clustersz + 1; i++) { 755 refs = htobe16(1); 756 if (pwrite(fd, &refs, 2, initsz + 2*i) != 2) 757 goto error; 758 } 759 760 ret = close(fd); 761 return (ret); 762 error: 763 ret = errno; 764 close(fd); 765 unlink(imgfile_path); 766 return (errno); 767 } 768