/*	$OpenBSD: vioqcow2.c,v 1.25 2024/09/26 01:45:13 jsg Exp $	*/

/*
 * Copyright (c) 2018 Ori Bernstein <ori@eigenstate.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/types.h>
#include <sys/stat.h>

#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <libgen.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "virtio.h"

/* Flag bits carried in the top of L1/L2 table entries. */
#define QCOW2_COMPRESSED	0x4000000000000000ull
#define QCOW2_INPLACE		0x8000000000000000ull

/* The only incompatible-feature bits qc2_open() tolerates. */
#define QCOW2_DIRTY		(1 << 0)
#define QCOW2_CORRUPT		(1 << 1)

enum {
	ICFEATURE_DIRTY		= 1 << 0,
	ICFEATURE_CORRUPT	= 1 << 1,
};

enum {
	ACFEATURE_BITEXT	= 1 << 0,
};

/*
 * On-disk qcow2 header as read from offset 0 of the image.  Multi-byte
 * fields are stored big-endian and are converted with be32toh()/be64toh()
 * when loaded.
 */
struct qcheader {
	char magic[4];		/* compared against VM_MAGIC_QCOW */
	uint32_t version;	/* only 2 and 3 are accepted */
	uint64_t backingoff;	/* file offset of the base image path */
	uint32_t backingsz;	/* length of that path; 0 means no base */
	uint32_t clustershift;	/* cluster size is 1 << clustershift */
	uint64_t disksz;	/* virtual disk size in bytes */
	uint32_t cryptmethod;
	uint32_t l1sz;		/* number of 8-byte L1 table entries */
	uint64_t l1off;		/* file offset of the L1 table */
	uint64_t refoff;	/* file offset of the refcount table */
	uint32_t refsz;
	uint32_t snapcount;
	uint64_t snapsz;	/* loaded into qcdisk.snapoff */
	/* v3 additions */
	uint64_t incompatfeatures;
	uint64_t compatfeatures;
	uint64_t autoclearfeatures;
	uint32_t reforder;	/* Bits = 1 << reforder */
	uint32_t headersz;
} __packed;

/* In-memory state for one open qcow2 image in a chain of base images. */
struct qcdisk {
	pthread_rwlock_t lock;	/* read-held by xlate, write-held by mkcluster */
	struct qcdisk *base;	/* base image this one is layered on, or NULL */
	struct qcheader header;

	int fd;
	uint64_t *l1;		/* L1 table, converted to host byte order */
	off_t end;		/* file size; new clusters are appended here */
	off_t clustersz;
	off_t disksz;	/* In bytes */
	uint32_t cryptmethod;

	uint32_t l1sz;
	off_t l1off;

	off_t refoff;
	off_t refsz;

	uint32_t nsnap;
	off_t snapoff;

	/* v3 features */
	uint64_t incompatfeatures;
	uint64_t autoclearfeatures;
	uint32_t refssz;
	uint32_t headersz;
};

extern char *__progname;

static off_t xlate(struct qcdisk *, off_t, int *);
static void copy_cluster(struct qcdisk *, struct qcdisk *, off_t, off_t);
static void inc_refs(struct qcdisk *, off_t, int);
static off_t mkcluster(struct qcdisk *, struct qcdisk *, off_t, off_t);
static int qc2_open(struct qcdisk *, int *, size_t);
static ssize_t qc2_pread(void *, char *, size_t, off_t);
static ssize_t qc2_preadv(void *, struct iovec *, int, off_t);
static ssize_t qc2_pwrite(void *, char *, size_t, off_t);
static ssize_t qc2_pwritev(void *, struct iovec *, int, off_t);
static void qc2_close(void *, int);

/*
 * Initializes a qcow2 disk image backing file from an fd.  Stores the
 * number of bytes in *szp, returning -1 for error, 0 for success.
113f224f92aSccardenas * 114f224f92aSccardenas * May open snapshot base images. 115f224f92aSccardenas */ 116f224f92aSccardenas int 11762df93eeSreyk virtio_qcow2_init(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd) 118f224f92aSccardenas { 119f224f92aSccardenas struct qcdisk *diskp; 120f224f92aSccardenas 121f224f92aSccardenas diskp = malloc(sizeof(struct qcdisk)); 122f224f92aSccardenas if (diskp == NULL) 123f224f92aSccardenas return -1; 12473613953Sreyk if (qc2_open(diskp, fd, nfd) == -1) { 12507e1a8caSori log_warnx("could not open qcow2 disk"); 126f224f92aSccardenas return -1; 127f224f92aSccardenas } 128f224f92aSccardenas file->p = diskp; 129f224f92aSccardenas file->pread = qc2_pread; 13020e554f8Sdv file->preadv = qc2_preadv; 131f224f92aSccardenas file->pwrite = qc2_pwrite; 13220e554f8Sdv file->pwritev = qc2_pwritev; 133f224f92aSccardenas file->close = qc2_close; 134a3500374Sasou *szp = diskp->disksz; 135f224f92aSccardenas return 0; 136f224f92aSccardenas } 137f224f92aSccardenas 13807e1a8caSori /* 13907e1a8caSori * Return the path to the base image given a disk image. 14007e1a8caSori * Called from vmctl. 
14107e1a8caSori */ 14273613953Sreyk ssize_t 1434d2a1fb2Sreyk virtio_qcow2_get_base(int fd, char *path, size_t npath, const char *dpath) 144f224f92aSccardenas { 1450d8d8a26Snaddy char dpathbuf[PATH_MAX]; 1464d2a1fb2Sreyk char expanded[PATH_MAX]; 14773613953Sreyk struct qcheader header; 14873613953Sreyk uint64_t backingoff; 14973613953Sreyk uint32_t backingsz; 1504d2a1fb2Sreyk char *s = NULL; 151f224f92aSccardenas 15273613953Sreyk if (pread(fd, &header, sizeof(header), 0) != sizeof(header)) { 15307e1a8caSori log_warnx("short read on header"); 154f224f92aSccardenas return -1; 15573613953Sreyk } 15673613953Sreyk if (strncmp(header.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0) { 15707e1a8caSori log_warnx("invalid magic numbers"); 15873613953Sreyk return -1; 15973613953Sreyk } 16073613953Sreyk backingoff = be64toh(header.backingoff); 16173613953Sreyk backingsz = be32toh(header.backingsz); 1624d2a1fb2Sreyk if (backingsz == 0) 1634d2a1fb2Sreyk return 0; 1644d2a1fb2Sreyk 16573613953Sreyk if (backingsz >= npath - 1) { 16607e1a8caSori log_warnx("snapshot path too long"); 16773613953Sreyk return -1; 16873613953Sreyk } 16973613953Sreyk if (pread(fd, path, backingsz, backingoff) != backingsz) { 17007e1a8caSori log_warnx("could not read snapshot base name"); 17173613953Sreyk return -1; 17273613953Sreyk } 17373613953Sreyk path[backingsz] = '\0'; 1744d2a1fb2Sreyk 1754d2a1fb2Sreyk /* 1764d2a1fb2Sreyk * Relative paths should be interpreted relative to the disk image, 1774d2a1fb2Sreyk * rather than relative to the directory vmd happens to be running in, 1782eec0843Sdv * since this is the only useful interpretation. 
1794d2a1fb2Sreyk */ 1804d2a1fb2Sreyk if (path[0] == '/') { 1814d2a1fb2Sreyk if (realpath(path, expanded) == NULL || 1824d2a1fb2Sreyk strlcpy(path, expanded, npath) >= npath) { 18307e1a8caSori log_warnx("unable to resolve %s", path); 1844d2a1fb2Sreyk return -1; 18573613953Sreyk } 1864d2a1fb2Sreyk } else { 1870d8d8a26Snaddy if (strlcpy(dpathbuf, dpath, sizeof(dpathbuf)) >= 1880d8d8a26Snaddy sizeof(dpathbuf)) { 1890d8d8a26Snaddy log_warnx("path too long: %s", dpath); 1900d8d8a26Snaddy return -1; 1910d8d8a26Snaddy } 1920d8d8a26Snaddy s = dirname(dpathbuf); 1934d2a1fb2Sreyk if (snprintf(expanded, sizeof(expanded), 1944d2a1fb2Sreyk "%s/%s", s, path) >= (int)sizeof(expanded)) { 19507e1a8caSori log_warnx("path too long: %s/%s", s, path); 1964d2a1fb2Sreyk return -1; 1974d2a1fb2Sreyk } 1984d2a1fb2Sreyk if (npath < PATH_MAX || 1994d2a1fb2Sreyk realpath(expanded, path) == NULL) { 20007e1a8caSori log_warnx("unable to resolve %s", path); 2014d2a1fb2Sreyk return -1; 2024d2a1fb2Sreyk } 2034d2a1fb2Sreyk } 2044d2a1fb2Sreyk 2054d2a1fb2Sreyk return strlen(path); 206f224f92aSccardenas } 207f224f92aSccardenas 208f224f92aSccardenas static int 20973613953Sreyk qc2_open(struct qcdisk *disk, int *fds, size_t nfd) 210f224f92aSccardenas { 211f224f92aSccardenas char basepath[PATH_MAX]; 212f224f92aSccardenas struct stat st; 213f224f92aSccardenas struct qcheader header; 214f224f92aSccardenas uint64_t backingoff; 215f224f92aSccardenas uint32_t backingsz; 216ed9943e8Sori off_t i; 21773613953Sreyk int version, fd; 218f224f92aSccardenas 219f224f92aSccardenas pthread_rwlock_init(&disk->lock, NULL); 22073613953Sreyk fd = fds[0]; 221f224f92aSccardenas disk->fd = fd; 222f224f92aSccardenas disk->base = NULL; 22350bebf2cSccardenas disk->l1 = NULL; 22450bebf2cSccardenas 22507e1a8caSori if (pread(fd, &header, sizeof(header), 0) != sizeof(header)) 22607e1a8caSori fatalx("short read on header"); 22707e1a8caSori if (strncmp(header.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0) 22807e1a8caSori 
fatalx("invalid magic numbers"); 229f224f92aSccardenas 230f224f92aSccardenas disk->clustersz = (1ull << be32toh(header.clustershift)); 231f224f92aSccardenas disk->disksz = be64toh(header.disksz); 232f224f92aSccardenas disk->cryptmethod = be32toh(header.cryptmethod); 233f224f92aSccardenas disk->l1sz = be32toh(header.l1sz); 234f224f92aSccardenas disk->l1off = be64toh(header.l1off); 235f224f92aSccardenas disk->refsz = be32toh(header.refsz); 236f224f92aSccardenas disk->refoff = be64toh(header.refoff); 237f224f92aSccardenas disk->nsnap = be32toh(header.snapcount); 238f224f92aSccardenas disk->snapoff = be64toh(header.snapsz); 23950bebf2cSccardenas 240f224f92aSccardenas /* 241f224f92aSccardenas * The additional features here are defined as 0 in the v2 format, 242f224f92aSccardenas * so as long as we clear the buffer before parsing, we don't need 243f224f92aSccardenas * to check versions here. 244f224f92aSccardenas */ 245f224f92aSccardenas disk->incompatfeatures = be64toh(header.incompatfeatures); 246f224f92aSccardenas disk->autoclearfeatures = be64toh(header.autoclearfeatures); 247f224f92aSccardenas disk->refssz = be32toh(header.refsz); 248f224f92aSccardenas disk->headersz = be32toh(header.headersz); 249f224f92aSccardenas 250f224f92aSccardenas /* 251f224f92aSccardenas * We only know about the dirty or corrupt bits here. 
252f224f92aSccardenas */ 25307e1a8caSori if (disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT)) 25407e1a8caSori fatalx("unsupported features %llx", 255f224f92aSccardenas disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT)); 25607e1a8caSori if (be32toh(header.reforder) != 4) 25707e1a8caSori fatalx("unsupported refcount size\n"); 258f224f92aSccardenas 25973613953Sreyk disk->l1 = calloc(disk->l1sz, sizeof(*disk->l1)); 26050bebf2cSccardenas if (!disk->l1) 26107e1a8caSori fatal("%s: could not allocate l1 table", __func__); 26250bebf2cSccardenas if (pread(disk->fd, disk->l1, 8 * disk->l1sz, disk->l1off) 26307e1a8caSori != 8 * disk->l1sz) 26407e1a8caSori fatalx("%s: unable to read qcow2 L1 table", __func__); 265f224f92aSccardenas for (i = 0; i < disk->l1sz; i++) 266f224f92aSccardenas disk->l1[i] = be64toh(disk->l1[i]); 267f224f92aSccardenas version = be32toh(header.version); 26807e1a8caSori if (version != 2 && version != 3) 26907e1a8caSori fatalx("%s: unknown qcow2 version %d", __func__, version); 270f224f92aSccardenas 271f224f92aSccardenas backingoff = be64toh(header.backingoff); 272f224f92aSccardenas backingsz = be32toh(header.backingsz); 273f224f92aSccardenas if (backingsz != 0) { 27473613953Sreyk if (backingsz >= sizeof(basepath) - 1) { 27507e1a8caSori fatalx("%s: snapshot path too long", __func__); 276f224f92aSccardenas } 277f224f92aSccardenas if (pread(fd, basepath, backingsz, backingoff) != backingsz) { 27807e1a8caSori fatalx("%s: could not read snapshot base name", 2792919bad8Sccardenas __func__); 280f224f92aSccardenas } 281f224f92aSccardenas basepath[backingsz] = 0; 28273613953Sreyk if (nfd <= 1) { 28307e1a8caSori fatalx("%s: missing base image %s", __func__, 28473613953Sreyk basepath); 28573613953Sreyk } 28673613953Sreyk 287f224f92aSccardenas 288f224f92aSccardenas disk->base = calloc(1, sizeof(struct qcdisk)); 28950bebf2cSccardenas if (!disk->base) 29007e1a8caSori fatal("%s: could not open %s", __func__, basepath); 29107e1a8caSori if (qc2_open(disk->base, 
fds + 1, nfd - 1) == -1) 29207e1a8caSori fatalx("%s: could not open %s", __func__, basepath); 29307e1a8caSori if (disk->base->clustersz != disk->clustersz) 29407e1a8caSori fatalx("%s: all disk parts must share clustersize", 2952919bad8Sccardenas __func__); 296f224f92aSccardenas } 29707e1a8caSori if (fstat(fd, &st) == -1) 29807e1a8caSori fatal("%s: unable to stat disk", __func__); 29950bebf2cSccardenas 300f224f92aSccardenas disk->end = st.st_size; 30150bebf2cSccardenas 302e2d3e60dSreyk log_debug("%s: qcow2 disk version %d size %lld end %lld snap %d", 30307e1a8caSori __func__, version, disk->disksz, disk->end, disk->nsnap); 304e2d3e60dSreyk 305f224f92aSccardenas return 0; 306f224f92aSccardenas } 307f224f92aSccardenas 308f224f92aSccardenas static ssize_t 30920e554f8Sdv qc2_preadv(void *p, struct iovec *iov, int cnt, off_t offset) 31020e554f8Sdv { 31120e554f8Sdv int i; 31220e554f8Sdv off_t pos = offset; 31320e554f8Sdv ssize_t sz = 0, total = 0; 31420e554f8Sdv 31520e554f8Sdv for (i = 0; i < cnt; i++, iov++) { 31620e554f8Sdv sz = qc2_pread(p, iov->iov_base, iov->iov_len, pos); 31720e554f8Sdv if (sz == -1) 31820e554f8Sdv return (sz); 31920e554f8Sdv total += sz; 32020e554f8Sdv pos += sz; 32120e554f8Sdv } 32220e554f8Sdv 32320e554f8Sdv return (total); 32420e554f8Sdv } 32520e554f8Sdv 32620e554f8Sdv static ssize_t 327f224f92aSccardenas qc2_pread(void *p, char *buf, size_t len, off_t off) 328f224f92aSccardenas { 329f224f92aSccardenas struct qcdisk *disk, *d; 330f224f92aSccardenas off_t phys_off, end, cluster_off; 331f224f92aSccardenas ssize_t sz, rem; 332f224f92aSccardenas 333f224f92aSccardenas disk = p; 334f224f92aSccardenas end = off + len; 335f224f92aSccardenas if (off < 0 || end > disk->disksz) 336f224f92aSccardenas return -1; 337f224f92aSccardenas 338f224f92aSccardenas /* handle head chunk separately */ 339f224f92aSccardenas rem = len; 340f224f92aSccardenas while (off != end) { 341f224f92aSccardenas for (d = disk; d; d = d->base) 342f224f92aSccardenas if ((phys_off = 
xlate(d, off, NULL)) > 0) 343f224f92aSccardenas break; 344f224f92aSccardenas /* Break out into chunks. This handles 345f224f92aSccardenas * three cases: 346f224f92aSccardenas * 34707e1a8caSori * |----+====|========|====+-----| 348f224f92aSccardenas * 349f224f92aSccardenas * Either we are at the start of the read, 350f224f92aSccardenas * and the cluster has some leading bytes. 351f224f92aSccardenas * This means that we are reading the tail 352f224f92aSccardenas * of the cluster, and our size is: 353f224f92aSccardenas * 354f224f92aSccardenas * clustersz - (off % clustersz). 355f224f92aSccardenas * 356f224f92aSccardenas * Otherwise, we're reading the middle section. 357f224f92aSccardenas * We're already aligned here, so we can just 358f224f92aSccardenas * read the whole cluster size. Or we're at the 359f224f92aSccardenas * tail, at which point we just want to read the 360f224f92aSccardenas * remaining bytes. 361f224f92aSccardenas */ 362f224f92aSccardenas cluster_off = off % disk->clustersz; 363f224f92aSccardenas sz = disk->clustersz - cluster_off; 364f224f92aSccardenas if (sz > rem) 365f224f92aSccardenas sz = rem; 366f224f92aSccardenas /* 367f224f92aSccardenas * If we're within the disk, but don't have backing bytes, 368f224f92aSccardenas * just read back zeros. 
369f224f92aSccardenas */ 370f224f92aSccardenas if (!d) 371f224f92aSccardenas bzero(buf, sz); 372f224f92aSccardenas else if (pread(d->fd, buf, sz, phys_off) != sz) 373f224f92aSccardenas return -1; 374f224f92aSccardenas off += sz; 375f224f92aSccardenas buf += sz; 376f224f92aSccardenas rem -= sz; 377f224f92aSccardenas } 378f224f92aSccardenas return len; 379f224f92aSccardenas } 380f224f92aSccardenas 38120e554f8Sdv static ssize_t 38220e554f8Sdv qc2_pwritev(void *p, struct iovec *iov, int cnt, off_t offset) 38320e554f8Sdv { 38420e554f8Sdv int i; 38520e554f8Sdv off_t pos = offset; 38620e554f8Sdv ssize_t sz = 0, total = 0; 38720e554f8Sdv 38820e554f8Sdv for (i = 0; i < cnt; i++, iov++) { 38920e554f8Sdv sz = qc2_pwrite(p, iov->iov_base, iov->iov_len, pos); 39020e554f8Sdv if (sz == -1) 39120e554f8Sdv return (sz); 39220e554f8Sdv total += sz; 39320e554f8Sdv pos += sz; 39420e554f8Sdv } 39520e554f8Sdv 39620e554f8Sdv return (total); 39720e554f8Sdv } 39820e554f8Sdv 39920e554f8Sdv static ssize_t 400f224f92aSccardenas qc2_pwrite(void *p, char *buf, size_t len, off_t off) 401f224f92aSccardenas { 402f224f92aSccardenas struct qcdisk *disk, *d; 403f224f92aSccardenas off_t phys_off, cluster_off, end; 404f224f92aSccardenas ssize_t sz, rem; 405f224f92aSccardenas int inplace; 406f224f92aSccardenas 407f224f92aSccardenas d = p; 408f224f92aSccardenas disk = p; 409f224f92aSccardenas inplace = 1; 410f224f92aSccardenas end = off + len; 411f224f92aSccardenas if (off < 0 || end > disk->disksz) 412f224f92aSccardenas return -1; 413f224f92aSccardenas rem = len; 414f224f92aSccardenas while (off != end) { 415f224f92aSccardenas /* See the read code for a summary of the computation */ 416f224f92aSccardenas cluster_off = off % disk->clustersz; 417f224f92aSccardenas sz = disk->clustersz - cluster_off; 418f224f92aSccardenas if (sz > rem) 419f224f92aSccardenas sz = rem; 420f224f92aSccardenas 421f224f92aSccardenas phys_off = xlate(disk, off, &inplace); 422f224f92aSccardenas if (phys_off == -1) 
423f224f92aSccardenas return -1; 424f224f92aSccardenas /* 425f224f92aSccardenas * If we couldn't find the cluster in the writable disk, 426f224f92aSccardenas * see if it exists in the base image. If it does, we 427f224f92aSccardenas * need to copy it before the write. The copy happens 428f224f92aSccardenas * in the '!inplace' if clause below te search. 429f224f92aSccardenas */ 430f224f92aSccardenas if (phys_off == 0) 431f224f92aSccardenas for (d = disk->base; d; d = d->base) 432f224f92aSccardenas if ((phys_off = xlate(d, off, NULL)) > 0) 433f224f92aSccardenas break; 434f224f92aSccardenas if (!inplace || phys_off == 0) 435f224f92aSccardenas phys_off = mkcluster(disk, d, off, phys_off); 436f224f92aSccardenas if (phys_off == -1) 437f224f92aSccardenas return -1; 43807e1a8caSori if (phys_off < disk->clustersz) 43907e1a8caSori fatalx("%s: writing reserved cluster", __func__); 440f224f92aSccardenas if (pwrite(disk->fd, buf, sz, phys_off) != sz) 441f224f92aSccardenas return -1; 442f224f92aSccardenas off += sz; 443f224f92aSccardenas buf += sz; 444f224f92aSccardenas rem -= sz; 445f224f92aSccardenas } 446f224f92aSccardenas return len; 447f224f92aSccardenas } 448f224f92aSccardenas 449f224f92aSccardenas static void 450f6c09be3Sreyk qc2_close(void *p, int stayopen) 451f224f92aSccardenas { 452f224f92aSccardenas struct qcdisk *disk; 453f224f92aSccardenas 454f224f92aSccardenas disk = p; 45550bebf2cSccardenas if (disk->base) 456f6c09be3Sreyk qc2_close(disk->base, stayopen); 457f6c09be3Sreyk if (!stayopen) 458f224f92aSccardenas close(disk->fd); 45950bebf2cSccardenas free(disk->l1); 460f224f92aSccardenas free(disk); 461f224f92aSccardenas } 462f224f92aSccardenas 463f224f92aSccardenas /* 464f224f92aSccardenas * Translates a virtual offset into an on-disk offset. 
465f224f92aSccardenas * Returns: 466f224f92aSccardenas * -1 on error 467f224f92aSccardenas * 0 on 'not found' 468f224f92aSccardenas * >0 on found 469f224f92aSccardenas */ 470f224f92aSccardenas static off_t 471f224f92aSccardenas xlate(struct qcdisk *disk, off_t off, int *inplace) 472f224f92aSccardenas { 473f224f92aSccardenas off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff; 474f224f92aSccardenas uint64_t buf; 475f224f92aSccardenas 476f224f92aSccardenas 477f224f92aSccardenas /* 478f224f92aSccardenas * Clear out inplace flag -- xlate misses should not 479f224f92aSccardenas * be flagged as updatable in place. We will still 480f224f92aSccardenas * return 0 from them, but this leaves less surprises 481f224f92aSccardenas * in the API. 482f224f92aSccardenas */ 483f224f92aSccardenas if (inplace) 484f224f92aSccardenas *inplace = 0; 485f224f92aSccardenas pthread_rwlock_rdlock(&disk->lock); 486f224f92aSccardenas if (off < 0) 487f224f92aSccardenas goto err; 488f224f92aSccardenas 489f224f92aSccardenas l2sz = disk->clustersz / 8; 490f224f92aSccardenas l1off = (off / disk->clustersz) / l2sz; 491f224f92aSccardenas if (l1off >= disk->l1sz) 492f224f92aSccardenas goto err; 493f224f92aSccardenas 494f224f92aSccardenas l2tab = disk->l1[l1off]; 495f224f92aSccardenas l2tab &= ~QCOW2_INPLACE; 496f224f92aSccardenas if (l2tab == 0) { 497f224f92aSccardenas pthread_rwlock_unlock(&disk->lock); 498f224f92aSccardenas return 0; 499f224f92aSccardenas } 500f224f92aSccardenas l2off = (off / disk->clustersz) % l2sz; 501f224f92aSccardenas pread(disk->fd, &buf, sizeof(buf), l2tab + l2off * 8); 502f224f92aSccardenas cluster = be64toh(buf); 503f224f92aSccardenas /* 504f224f92aSccardenas * cluster may be 0, but all future operations don't affect 505f224f92aSccardenas * the return value. 
506f224f92aSccardenas */ 507f224f92aSccardenas if (inplace) 508f224f92aSccardenas *inplace = !!(cluster & QCOW2_INPLACE); 50907e1a8caSori if (cluster & QCOW2_COMPRESSED) 51007e1a8caSori fatalx("%s: compressed clusters unsupported", __func__); 511f224f92aSccardenas pthread_rwlock_unlock(&disk->lock); 512f224f92aSccardenas clusteroff = 0; 513f224f92aSccardenas cluster &= ~QCOW2_INPLACE; 514f224f92aSccardenas if (cluster) 515f224f92aSccardenas clusteroff = off % disk->clustersz; 516f224f92aSccardenas return cluster + clusteroff; 517f224f92aSccardenas err: 518f224f92aSccardenas pthread_rwlock_unlock(&disk->lock); 519f224f92aSccardenas return -1; 520f224f92aSccardenas } 521f224f92aSccardenas 522f224f92aSccardenas /* 523f224f92aSccardenas * Allocates a new cluster on disk, creating a new L2 table 524f224f92aSccardenas * if needed. The cluster starts off with a refs of one, 525f224f92aSccardenas * and the writable bit set. 526f224f92aSccardenas * 527f224f92aSccardenas * Returns -1 on error, and the physical address within the 528f224f92aSccardenas * cluster of the write offset if it exists. 
529f224f92aSccardenas */ 530f224f92aSccardenas static off_t 531f224f92aSccardenas mkcluster(struct qcdisk *disk, struct qcdisk *base, off_t off, off_t src_phys) 532f224f92aSccardenas { 533f224f92aSccardenas off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff, orig; 534f224f92aSccardenas uint64_t buf; 535f224f92aSccardenas 536f224f92aSccardenas pthread_rwlock_wrlock(&disk->lock); 537f224f92aSccardenas 538f224f92aSccardenas cluster = -1; 539f224f92aSccardenas /* L1 entries always exist */ 540f224f92aSccardenas l2sz = disk->clustersz / 8; 541f224f92aSccardenas l1off = off / (disk->clustersz * l2sz); 542f224f92aSccardenas if (l1off >= disk->l1sz) 54307e1a8caSori fatalx("l1 offset outside disk"); 544f224f92aSccardenas 545f224f92aSccardenas disk->end = (disk->end + disk->clustersz - 1) & ~(disk->clustersz - 1); 546f224f92aSccardenas 547f224f92aSccardenas l2tab = disk->l1[l1off]; 548f224f92aSccardenas l2off = (off / disk->clustersz) % l2sz; 549f224f92aSccardenas /* We may need to create or clone an L2 entry to map the block */ 550f224f92aSccardenas if (l2tab == 0 || (l2tab & QCOW2_INPLACE) == 0) { 551f224f92aSccardenas orig = l2tab & ~QCOW2_INPLACE; 552f224f92aSccardenas l2tab = disk->end; 553f224f92aSccardenas disk->end += disk->clustersz; 55407e1a8caSori if (ftruncate(disk->fd, disk->end) == -1) 55507e1a8caSori fatal("%s: ftruncate failed", __func__); 556f224f92aSccardenas 557f224f92aSccardenas /* 558f224f92aSccardenas * If we translated, found a L2 entry, but it needed to 559f224f92aSccardenas * be copied, copy it. 
560f224f92aSccardenas */ 56107e1a8caSori if (orig != 0) 56207e1a8caSori copy_cluster(disk, disk, l2tab, orig); 563f224f92aSccardenas /* Update l1 -- we flush it later */ 564f224f92aSccardenas disk->l1[l1off] = l2tab | QCOW2_INPLACE; 56507e1a8caSori inc_refs(disk, l2tab, 1); 566f224f92aSccardenas } 567f224f92aSccardenas l2tab &= ~QCOW2_INPLACE; 568f224f92aSccardenas 569f224f92aSccardenas /* Grow the disk */ 570f224f92aSccardenas if (ftruncate(disk->fd, disk->end + disk->clustersz) < 0) 5714a1c7b02Sdv fatal("%s: could not grow disk", __func__); 572f224f92aSccardenas if (src_phys > 0) 57307e1a8caSori copy_cluster(disk, base, disk->end, src_phys); 574f224f92aSccardenas cluster = disk->end; 575f224f92aSccardenas disk->end += disk->clustersz; 576f224f92aSccardenas buf = htobe64(cluster | QCOW2_INPLACE); 57773613953Sreyk if (pwrite(disk->fd, &buf, sizeof(buf), l2tab + l2off * 8) != 8) 57807e1a8caSori fatalx("%s: could not write cluster", __func__); 579f224f92aSccardenas 580f224f92aSccardenas /* TODO: lazily sync: currently VMD doesn't close things */ 581f224f92aSccardenas buf = htobe64(disk->l1[l1off]); 58273613953Sreyk if (pwrite(disk->fd, &buf, sizeof(buf), disk->l1off + 8 * l1off) != 8) 58307e1a8caSori fatalx("%s: could not write l1", __func__); 58407e1a8caSori inc_refs(disk, cluster, 1); 585f224f92aSccardenas 586f224f92aSccardenas pthread_rwlock_unlock(&disk->lock); 587f224f92aSccardenas clusteroff = off % disk->clustersz; 58807e1a8caSori if (cluster + clusteroff < disk->clustersz) 58907e1a8caSori fatalx("write would clobber header"); 590f224f92aSccardenas return cluster + clusteroff; 591f224f92aSccardenas } 592f224f92aSccardenas 593f224f92aSccardenas /* Copies a cluster containing src to dst. Src and dst need not be aligned. 
*/ 59407e1a8caSori static void 595f224f92aSccardenas copy_cluster(struct qcdisk *disk, struct qcdisk *base, off_t dst, off_t src) 596f224f92aSccardenas { 597f224f92aSccardenas char *scratch; 598f224f92aSccardenas 599fb6f09faSderaadt scratch = malloc(disk->clustersz); 600f224f92aSccardenas if (!scratch) 60107e1a8caSori fatal("out of memory"); 602f224f92aSccardenas src &= ~(disk->clustersz - 1); 603f224f92aSccardenas dst &= ~(disk->clustersz - 1); 604f224f92aSccardenas if (pread(base->fd, scratch, disk->clustersz, src) == -1) 60507e1a8caSori fatal("%s: could not read cluster", __func__); 606f224f92aSccardenas if (pwrite(disk->fd, scratch, disk->clustersz, dst) == -1) 60707e1a8caSori fatal("%s: could not write cluster", __func__); 608fb6f09faSderaadt free(scratch); 609f224f92aSccardenas } 610f224f92aSccardenas 61107e1a8caSori static void 612f224f92aSccardenas inc_refs(struct qcdisk *disk, off_t off, int newcluster) 613f224f92aSccardenas { 614f224f92aSccardenas off_t l1off, l1idx, l2idx, l2cluster; 615f224f92aSccardenas size_t nper; 616f224f92aSccardenas uint16_t refs; 617f224f92aSccardenas uint64_t buf; 618f224f92aSccardenas 619f224f92aSccardenas off &= ~QCOW2_INPLACE; 620f224f92aSccardenas nper = disk->clustersz / 2; 621f224f92aSccardenas l1idx = (off / disk->clustersz) / nper; 622f224f92aSccardenas l2idx = (off / disk->clustersz) % nper; 623f224f92aSccardenas l1off = disk->refoff + 8 * l1idx; 62473613953Sreyk if (pread(disk->fd, &buf, sizeof(buf), l1off) != 8) 62507e1a8caSori fatal("could not read refs"); 626f224f92aSccardenas 627f224f92aSccardenas l2cluster = be64toh(buf); 628f224f92aSccardenas if (l2cluster == 0) { 629f224f92aSccardenas l2cluster = disk->end; 630f224f92aSccardenas disk->end += disk->clustersz; 63107e1a8caSori if (ftruncate(disk->fd, disk->end) < 0) 63207e1a8caSori fatal("%s: failed to allocate ref block", __func__); 633f224f92aSccardenas buf = htobe64(l2cluster); 63407e1a8caSori if (pwrite(disk->fd, &buf, sizeof(buf), l1off) != 8) 63507e1a8caSori 
fatal("%s: failed to write ref block", __func__); 636f224f92aSccardenas } 637f224f92aSccardenas 638f224f92aSccardenas refs = 1; 639f224f92aSccardenas if (!newcluster) { 64073613953Sreyk if (pread(disk->fd, &refs, sizeof(refs), 64173613953Sreyk l2cluster + 2 * l2idx) != 2) 64207e1a8caSori fatal("could not read ref cluster"); 643f224f92aSccardenas refs = be16toh(refs) + 1; 644f224f92aSccardenas } 645f224f92aSccardenas refs = htobe16(refs); 64607e1a8caSori if (pwrite(disk->fd, &refs, sizeof(refs), l2cluster + 2 * l2idx) != 2) 64707e1a8caSori fatal("%s: could not write ref block", __func__); 648f224f92aSccardenas } 649f224f92aSccardenas 65062df93eeSreyk /* 65162df93eeSreyk * virtio_qcow2_create 65262df93eeSreyk * 65362df93eeSreyk * Create an empty qcow2 imagefile with the specified path and size. 65462df93eeSreyk * 65562df93eeSreyk * Parameters: 65662df93eeSreyk * imgfile_path: path to the image file to create 6572eec0843Sdv * imgsize : size of the image file to create (in bytes) 65862df93eeSreyk * 65962df93eeSreyk * Return: 66062df93eeSreyk * EEXIST: The requested image file already exists 66162df93eeSreyk * 0 : Image file successfully created 66262df93eeSreyk * Exxxx : Various other Exxxx errno codes due to other I/O errors 66362df93eeSreyk */ 66462df93eeSreyk int 66562df93eeSreyk virtio_qcow2_create(const char *imgfile_path, 666ead1b146Sdv const char *base_path, uint64_t disksz) 66762df93eeSreyk { 668702b4317Sdv struct qcheader hdr, basehdr; 66962df93eeSreyk int fd, ret; 67062df93eeSreyk ssize_t base_len; 671ead1b146Sdv uint64_t l1sz, refsz, initsz, clustersz; 67262df93eeSreyk uint64_t l1off, refoff, v, i, l1entrysz, refentrysz; 67362df93eeSreyk uint16_t refs; 67462df93eeSreyk 67562df93eeSreyk if (base_path) { 67662df93eeSreyk fd = open(base_path, O_RDONLY); 67762df93eeSreyk if (read(fd, &basehdr, sizeof(basehdr)) != sizeof(basehdr)) 6784a1c7b02Sdv errx(1, "failure to read base image header"); 67962df93eeSreyk close(fd); 68062df93eeSreyk if (strncmp(basehdr.magic, 
68162df93eeSreyk VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0) 68262df93eeSreyk errx(1, "base image is not a qcow2 file"); 68362df93eeSreyk if (!disksz) 68462df93eeSreyk disksz = betoh64(basehdr.disksz); 68562df93eeSreyk else if (disksz != betoh64(basehdr.disksz)) 68662df93eeSreyk errx(1, "base size does not match requested size"); 68762df93eeSreyk } 68862df93eeSreyk if (!base_path && !disksz) 68962df93eeSreyk errx(1, "missing disk size"); 69062df93eeSreyk 69162df93eeSreyk clustersz = (1<<16); 69262df93eeSreyk l1off = ALIGNSZ(sizeof(hdr), clustersz); 69362df93eeSreyk 69462df93eeSreyk l1entrysz = clustersz * clustersz / 8; 69562df93eeSreyk l1sz = (disksz + l1entrysz - 1) / l1entrysz; 69662df93eeSreyk 69762df93eeSreyk refoff = ALIGNSZ(l1off + 8*l1sz, clustersz); 69862df93eeSreyk refentrysz = clustersz * clustersz * clustersz / 2; 69962df93eeSreyk refsz = (disksz + refentrysz - 1) / refentrysz; 70062df93eeSreyk 70162df93eeSreyk initsz = ALIGNSZ(refoff + refsz*clustersz, clustersz); 70262df93eeSreyk base_len = base_path ? strlen(base_path) : 0; 70362df93eeSreyk 70462df93eeSreyk memcpy(hdr.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)); 70562df93eeSreyk hdr.version = htobe32(3); 70662df93eeSreyk hdr.backingoff = htobe64(base_path ? 
sizeof(hdr) : 0); 70762df93eeSreyk hdr.backingsz = htobe32(base_len); 70862df93eeSreyk hdr.clustershift = htobe32(16); 70962df93eeSreyk hdr.disksz = htobe64(disksz); 71062df93eeSreyk hdr.cryptmethod = htobe32(0); 71162df93eeSreyk hdr.l1sz = htobe32(l1sz); 71262df93eeSreyk hdr.l1off = htobe64(l1off); 71362df93eeSreyk hdr.refoff = htobe64(refoff); 71462df93eeSreyk hdr.refsz = htobe32(refsz); 71562df93eeSreyk hdr.snapcount = htobe32(0); 71662df93eeSreyk hdr.snapsz = htobe64(0); 71762df93eeSreyk hdr.incompatfeatures = htobe64(0); 71862df93eeSreyk hdr.compatfeatures = htobe64(0); 71962df93eeSreyk hdr.autoclearfeatures = htobe64(0); 72062df93eeSreyk hdr.reforder = htobe32(4); 72162df93eeSreyk hdr.headersz = htobe32(sizeof(hdr)); 72262df93eeSreyk 72362df93eeSreyk /* Refuse to overwrite an existing image */ 72462df93eeSreyk fd = open(imgfile_path, O_RDWR | O_CREAT | O_TRUNC | O_EXCL, 72562df93eeSreyk S_IRUSR | S_IWUSR); 72662df93eeSreyk if (fd == -1) 72762df93eeSreyk return (errno); 72862df93eeSreyk 72962df93eeSreyk /* Write out the header */ 73062df93eeSreyk if (write(fd, &hdr, sizeof(hdr)) != sizeof(hdr)) 73162df93eeSreyk goto error; 73262df93eeSreyk 73362df93eeSreyk /* Add the base image */ 73462df93eeSreyk if (base_path && write(fd, base_path, base_len) != base_len) 73562df93eeSreyk goto error; 73662df93eeSreyk 73762df93eeSreyk /* Extend to desired size, and add one refcount cluster */ 73862df93eeSreyk if (ftruncate(fd, (off_t)initsz + clustersz) == -1) 73962df93eeSreyk goto error; 74062df93eeSreyk 74162df93eeSreyk /* 74262df93eeSreyk * Paranoia: if our disk image takes more than one cluster 74362df93eeSreyk * to refcount the initial image, fail. 74462df93eeSreyk */ 74562df93eeSreyk if (initsz/clustersz > clustersz/2) { 74662df93eeSreyk errno = ERANGE; 74762df93eeSreyk goto error; 74862df93eeSreyk } 74962df93eeSreyk 75062df93eeSreyk /* Add a refcount block, and refcount ourselves. 
*/ 75162df93eeSreyk v = htobe64(initsz); 75262df93eeSreyk if (pwrite(fd, &v, 8, refoff) != 8) 75362df93eeSreyk goto error; 75462df93eeSreyk for (i = 0; i < initsz/clustersz + 1; i++) { 75562df93eeSreyk refs = htobe16(1); 75662df93eeSreyk if (pwrite(fd, &refs, 2, initsz + 2*i) != 2) 75762df93eeSreyk goto error; 75862df93eeSreyk } 75962df93eeSreyk 76062df93eeSreyk ret = close(fd); 76162df93eeSreyk return (ret); 76262df93eeSreyk error: 76362df93eeSreyk ret = errno; 76462df93eeSreyk close(fd); 76562df93eeSreyk unlink(imgfile_path); 76662df93eeSreyk return (errno); 76762df93eeSreyk } 768