xref: /openbsd-src/usr.sbin/vmd/vioqcow2.c (revision 65bbee46cad7861cd5a570f338df9e976422e3ab)
1*65bbee46Sjsg /*	$OpenBSD: vioqcow2.c,v 1.25 2024/09/26 01:45:13 jsg Exp $	*/
2f224f92aSccardenas 
3f224f92aSccardenas /*
4f224f92aSccardenas  * Copyright (c) 2018 Ori Bernstein <ori@eigenstate.org>
5f224f92aSccardenas  *
6f224f92aSccardenas  * Permission to use, copy, modify, and distribute this software for any
7f224f92aSccardenas  * purpose with or without fee is hereby granted, provided that the above
8f224f92aSccardenas  * copyright notice and this permission notice appear in all copies.
9f224f92aSccardenas  *
10f224f92aSccardenas  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11f224f92aSccardenas  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12f224f92aSccardenas  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13f224f92aSccardenas  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14f224f92aSccardenas  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15f224f92aSccardenas  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16f224f92aSccardenas  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17f224f92aSccardenas  */
18f224f92aSccardenas 
19f224f92aSccardenas #include <sys/types.h>
20f224f92aSccardenas #include <sys/stat.h>
21f224f92aSccardenas 
226eb4c859Sdv #include <err.h>
236eb4c859Sdv #include <errno.h>
246eb4c859Sdv #include <fcntl.h>
256eb4c859Sdv #include <libgen.h>
26f224f92aSccardenas #include <stdlib.h>
27f224f92aSccardenas #include <string.h>
28f224f92aSccardenas #include <unistd.h>
29f224f92aSccardenas 
30f224f92aSccardenas #include "virtio.h"
31f224f92aSccardenas 
/*
 * Flag bits stored in the top of 64-bit L1/L2 table entries.
 * xlate() refuses L2 entries with QCOW2_COMPRESSED set.  QCOW2_INPLACE is
 * masked off to obtain the file offset and tells the write path whether an
 * entry can be reused without allocating a fresh cluster first.
 */
#define QCOW2_COMPRESSED	0x4000000000000000ull
#define QCOW2_INPLACE		0x8000000000000000ull

/* The only incompatible-feature header bits qc2_open() accepts. */
#define QCOW2_DIRTY		(1 << 0)
#define QCOW2_CORRUPT		(1 << 1)

/* Incompatible-feature bits; same values as QCOW2_DIRTY and QCOW2_CORRUPT. */
enum {
	ICFEATURE_DIRTY		= 1 << 0,
	ICFEATURE_CORRUPT	= 1 << 1,
};

/* Autoclear-feature bits. */
enum {
	ACFEATURE_BITEXT	= 1 << 0,
};
46f224f92aSccardenas 
/*
 * On-disk qcow2 image header.  The struct is read from and written to
 * offset 0 of the image verbatim; every multi-byte field is big-endian
 * on disk and is converted with be32toh()/be64toh() before use.
 */
struct qcheader {
	char magic[4];			/* compared against VM_MAGIC_QCOW */
	uint32_t version;		/* 2 or 3 */
	uint64_t backingoff;		/* file offset of base image path */
	uint32_t backingsz;		/* length of base image path, 0 = none */
	uint32_t clustershift;		/* cluster size is 1 << clustershift */
	uint64_t disksz;		/* virtual disk size in bytes */
	uint32_t cryptmethod;		/* written as 0 when creating images */
	uint32_t l1sz;			/* number of 8-byte L1 entries */
	uint64_t l1off;			/* file offset of the L1 table */
	uint64_t refoff;		/* file offset of the refcount table */
	uint32_t refsz;			/* refcount table size, in clusters */
	uint32_t snapcount;		/* written as 0 when creating images */
	uint64_t snapsz;		/* loaded into qcdisk.snapoff */
	/* v3 additions */
	uint64_t incompatfeatures;
	uint64_t compatfeatures;
	uint64_t autoclearfeatures;
	uint32_t reforder;		/* Bits = 1 << reforder */
	uint32_t headersz;		/* written as sizeof(struct qcheader) */
} __packed;
68f224f92aSccardenas 
69f224f92aSccardenas struct qcdisk {
70f224f92aSccardenas 	pthread_rwlock_t lock;
71f224f92aSccardenas 	struct qcdisk *base;
72f224f92aSccardenas 	struct qcheader header;
73f224f92aSccardenas 
74f224f92aSccardenas 	int       fd;
75f224f92aSccardenas 	uint64_t *l1;
76f224f92aSccardenas 	off_t     end;
77ed9943e8Sori 	off_t	  clustersz;
78f224f92aSccardenas 	off_t	  disksz; /* In bytes */
79f224f92aSccardenas 	uint32_t  cryptmethod;
80f224f92aSccardenas 
81f224f92aSccardenas 	uint32_t l1sz;
82f224f92aSccardenas 	off_t	 l1off;
83f224f92aSccardenas 
84f224f92aSccardenas 	off_t	 refoff;
85ed9943e8Sori 	off_t	 refsz;
86f224f92aSccardenas 
87f224f92aSccardenas 	uint32_t nsnap;
88f224f92aSccardenas 	off_t	 snapoff;
89f224f92aSccardenas 
90f224f92aSccardenas 	/* v3 features */
91f224f92aSccardenas 	uint64_t incompatfeatures;
92f224f92aSccardenas 	uint64_t autoclearfeatures;
93f224f92aSccardenas 	uint32_t refssz;
94f224f92aSccardenas 	uint32_t headersz;
95f224f92aSccardenas };
96f224f92aSccardenas 
97f224f92aSccardenas extern char *__progname;
98f224f92aSccardenas 
99f224f92aSccardenas static off_t xlate(struct qcdisk *, off_t, int *);
10007e1a8caSori static void copy_cluster(struct qcdisk *, struct qcdisk *, off_t, off_t);
10107e1a8caSori static void inc_refs(struct qcdisk *, off_t, int);
102f224f92aSccardenas static off_t mkcluster(struct qcdisk *, struct qcdisk *, off_t, off_t);
10373613953Sreyk static int qc2_open(struct qcdisk *, int *, size_t);
104f224f92aSccardenas static ssize_t qc2_pread(void *, char *, size_t, off_t);
10520e554f8Sdv static ssize_t qc2_preadv(void *, struct iovec *, int, off_t);
106f224f92aSccardenas static ssize_t qc2_pwrite(void *, char *, size_t, off_t);
10720e554f8Sdv static ssize_t qc2_pwritev(void *, struct iovec *, int, off_t);
108f6c09be3Sreyk static void qc2_close(void *, int);
109f224f92aSccardenas 
110f224f92aSccardenas /*
1113481ecdfSdv  * Initializes a raw disk image backing file from an fd. Stores the
1123481ecdfSdv  * number of bytes in *szp, returning -1 for error, 0 for success.
113f224f92aSccardenas  *
114f224f92aSccardenas  * May open snapshot base images.
115f224f92aSccardenas  */
116f224f92aSccardenas int
11762df93eeSreyk virtio_qcow2_init(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd)
118f224f92aSccardenas {
119f224f92aSccardenas 	struct qcdisk *diskp;
120f224f92aSccardenas 
121f224f92aSccardenas 	diskp = malloc(sizeof(struct qcdisk));
122f224f92aSccardenas 	if (diskp == NULL)
123f224f92aSccardenas 		return -1;
12473613953Sreyk 	if (qc2_open(diskp, fd, nfd) == -1) {
12507e1a8caSori 		log_warnx("could not open qcow2 disk");
126f224f92aSccardenas 		return -1;
127f224f92aSccardenas 	}
128f224f92aSccardenas 	file->p = diskp;
129f224f92aSccardenas 	file->pread = qc2_pread;
13020e554f8Sdv 	file->preadv = qc2_preadv;
131f224f92aSccardenas 	file->pwrite = qc2_pwrite;
13220e554f8Sdv 	file->pwritev = qc2_pwritev;
133f224f92aSccardenas 	file->close = qc2_close;
134a3500374Sasou 	*szp = diskp->disksz;
135f224f92aSccardenas 	return 0;
136f224f92aSccardenas }
137f224f92aSccardenas 
13807e1a8caSori /*
13907e1a8caSori  * Return the path to the base image given a disk image.
14007e1a8caSori  * Called from vmctl.
14107e1a8caSori  */
14273613953Sreyk ssize_t
1434d2a1fb2Sreyk virtio_qcow2_get_base(int fd, char *path, size_t npath, const char *dpath)
144f224f92aSccardenas {
1450d8d8a26Snaddy 	char dpathbuf[PATH_MAX];
1464d2a1fb2Sreyk 	char expanded[PATH_MAX];
14773613953Sreyk 	struct qcheader header;
14873613953Sreyk 	uint64_t backingoff;
14973613953Sreyk 	uint32_t backingsz;
1504d2a1fb2Sreyk 	char *s = NULL;
151f224f92aSccardenas 
15273613953Sreyk 	if (pread(fd, &header, sizeof(header), 0) != sizeof(header)) {
15307e1a8caSori 		log_warnx("short read on header");
154f224f92aSccardenas 		return -1;
15573613953Sreyk 	}
15673613953Sreyk 	if (strncmp(header.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0) {
15707e1a8caSori 		log_warnx("invalid magic numbers");
15873613953Sreyk 		return -1;
15973613953Sreyk 	}
16073613953Sreyk 	backingoff = be64toh(header.backingoff);
16173613953Sreyk 	backingsz = be32toh(header.backingsz);
1624d2a1fb2Sreyk 	if (backingsz == 0)
1634d2a1fb2Sreyk 		return 0;
1644d2a1fb2Sreyk 
16573613953Sreyk 	if (backingsz >= npath - 1) {
16607e1a8caSori 		log_warnx("snapshot path too long");
16773613953Sreyk 		return -1;
16873613953Sreyk 	}
16973613953Sreyk 	if (pread(fd, path, backingsz, backingoff) != backingsz) {
17007e1a8caSori 		log_warnx("could not read snapshot base name");
17173613953Sreyk 		return -1;
17273613953Sreyk 	}
17373613953Sreyk 	path[backingsz] = '\0';
1744d2a1fb2Sreyk 
1754d2a1fb2Sreyk 	/*
1764d2a1fb2Sreyk 	 * Relative paths should be interpreted relative to the disk image,
1774d2a1fb2Sreyk 	 * rather than relative to the directory vmd happens to be running in,
1782eec0843Sdv 	 * since this is the only useful interpretation.
1794d2a1fb2Sreyk 	 */
1804d2a1fb2Sreyk 	if (path[0] == '/') {
1814d2a1fb2Sreyk 		if (realpath(path, expanded) == NULL ||
1824d2a1fb2Sreyk 		    strlcpy(path, expanded, npath) >= npath) {
18307e1a8caSori 			log_warnx("unable to resolve %s", path);
1844d2a1fb2Sreyk 			return -1;
18573613953Sreyk 		}
1864d2a1fb2Sreyk 	} else {
1870d8d8a26Snaddy 		if (strlcpy(dpathbuf, dpath, sizeof(dpathbuf)) >=
1880d8d8a26Snaddy 		    sizeof(dpathbuf)) {
1890d8d8a26Snaddy 			log_warnx("path too long: %s", dpath);
1900d8d8a26Snaddy 			return -1;
1910d8d8a26Snaddy 		}
1920d8d8a26Snaddy 		s = dirname(dpathbuf);
1934d2a1fb2Sreyk 		if (snprintf(expanded, sizeof(expanded),
1944d2a1fb2Sreyk 		    "%s/%s", s, path) >= (int)sizeof(expanded)) {
19507e1a8caSori 			log_warnx("path too long: %s/%s", s, path);
1964d2a1fb2Sreyk 			return -1;
1974d2a1fb2Sreyk 		}
1984d2a1fb2Sreyk 		if (npath < PATH_MAX ||
1994d2a1fb2Sreyk 		    realpath(expanded, path) == NULL) {
20007e1a8caSori 			log_warnx("unable to resolve %s", path);
2014d2a1fb2Sreyk 			return -1;
2024d2a1fb2Sreyk 		}
2034d2a1fb2Sreyk 	}
2044d2a1fb2Sreyk 
2054d2a1fb2Sreyk 	return strlen(path);
206f224f92aSccardenas }
207f224f92aSccardenas 
208f224f92aSccardenas static int
20973613953Sreyk qc2_open(struct qcdisk *disk, int *fds, size_t nfd)
210f224f92aSccardenas {
211f224f92aSccardenas 	char basepath[PATH_MAX];
212f224f92aSccardenas 	struct stat st;
213f224f92aSccardenas 	struct qcheader header;
214f224f92aSccardenas 	uint64_t backingoff;
215f224f92aSccardenas 	uint32_t backingsz;
216ed9943e8Sori 	off_t i;
21773613953Sreyk 	int version, fd;
218f224f92aSccardenas 
219f224f92aSccardenas 	pthread_rwlock_init(&disk->lock, NULL);
22073613953Sreyk 	fd = fds[0];
221f224f92aSccardenas 	disk->fd = fd;
222f224f92aSccardenas 	disk->base = NULL;
22350bebf2cSccardenas 	disk->l1 = NULL;
22450bebf2cSccardenas 
22507e1a8caSori 	if (pread(fd, &header, sizeof(header), 0) != sizeof(header))
22607e1a8caSori 		fatalx("short read on header");
22707e1a8caSori 	if (strncmp(header.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0)
22807e1a8caSori 		fatalx("invalid magic numbers");
229f224f92aSccardenas 
230f224f92aSccardenas 	disk->clustersz		= (1ull << be32toh(header.clustershift));
231f224f92aSccardenas 	disk->disksz		= be64toh(header.disksz);
232f224f92aSccardenas 	disk->cryptmethod	= be32toh(header.cryptmethod);
233f224f92aSccardenas 	disk->l1sz		= be32toh(header.l1sz);
234f224f92aSccardenas 	disk->l1off		= be64toh(header.l1off);
235f224f92aSccardenas 	disk->refsz		= be32toh(header.refsz);
236f224f92aSccardenas 	disk->refoff		= be64toh(header.refoff);
237f224f92aSccardenas 	disk->nsnap		= be32toh(header.snapcount);
238f224f92aSccardenas 	disk->snapoff		= be64toh(header.snapsz);
23950bebf2cSccardenas 
240f224f92aSccardenas 	/*
241f224f92aSccardenas 	 * The additional features here are defined as 0 in the v2 format,
242f224f92aSccardenas 	 * so as long as we clear the buffer before parsing, we don't need
243f224f92aSccardenas 	 * to check versions here.
244f224f92aSccardenas 	 */
245f224f92aSccardenas 	disk->incompatfeatures = be64toh(header.incompatfeatures);
246f224f92aSccardenas 	disk->autoclearfeatures = be64toh(header.autoclearfeatures);
247f224f92aSccardenas 	disk->refssz = be32toh(header.refsz);
248f224f92aSccardenas 	disk->headersz = be32toh(header.headersz);
249f224f92aSccardenas 
250f224f92aSccardenas 	/*
251f224f92aSccardenas 	 * We only know about the dirty or corrupt bits here.
252f224f92aSccardenas 	 */
25307e1a8caSori 	if (disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT))
25407e1a8caSori 		fatalx("unsupported features %llx",
255f224f92aSccardenas 		    disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT));
25607e1a8caSori 	if (be32toh(header.reforder) != 4)
25707e1a8caSori 		fatalx("unsupported refcount size\n");
258f224f92aSccardenas 
25973613953Sreyk 	disk->l1 = calloc(disk->l1sz, sizeof(*disk->l1));
26050bebf2cSccardenas 	if (!disk->l1)
26107e1a8caSori 		fatal("%s: could not allocate l1 table", __func__);
26250bebf2cSccardenas 	if (pread(disk->fd, disk->l1, 8 * disk->l1sz, disk->l1off)
26307e1a8caSori 	    != 8 * disk->l1sz)
26407e1a8caSori 		fatalx("%s: unable to read qcow2 L1 table", __func__);
265f224f92aSccardenas 	for (i = 0; i < disk->l1sz; i++)
266f224f92aSccardenas 		disk->l1[i] = be64toh(disk->l1[i]);
267f224f92aSccardenas 	version = be32toh(header.version);
26807e1a8caSori 	if (version != 2 && version != 3)
26907e1a8caSori 		fatalx("%s: unknown qcow2 version %d", __func__, version);
270f224f92aSccardenas 
271f224f92aSccardenas 	backingoff = be64toh(header.backingoff);
272f224f92aSccardenas 	backingsz = be32toh(header.backingsz);
273f224f92aSccardenas 	if (backingsz != 0) {
27473613953Sreyk 		if (backingsz >= sizeof(basepath) - 1) {
27507e1a8caSori 			fatalx("%s: snapshot path too long", __func__);
276f224f92aSccardenas 		}
277f224f92aSccardenas 		if (pread(fd, basepath, backingsz, backingoff) != backingsz) {
27807e1a8caSori 			fatalx("%s: could not read snapshot base name",
2792919bad8Sccardenas 			    __func__);
280f224f92aSccardenas 		}
281f224f92aSccardenas 		basepath[backingsz] = 0;
28273613953Sreyk 		if (nfd <= 1) {
28307e1a8caSori 			fatalx("%s: missing base image %s", __func__,
28473613953Sreyk 			    basepath);
28573613953Sreyk 		}
28673613953Sreyk 
287f224f92aSccardenas 
288f224f92aSccardenas 		disk->base = calloc(1, sizeof(struct qcdisk));
28950bebf2cSccardenas 		if (!disk->base)
29007e1a8caSori 			fatal("%s: could not open %s", __func__, basepath);
29107e1a8caSori 		if (qc2_open(disk->base, fds + 1, nfd - 1) == -1)
29207e1a8caSori 			fatalx("%s: could not open %s", __func__, basepath);
29307e1a8caSori 		if (disk->base->clustersz != disk->clustersz)
29407e1a8caSori 			fatalx("%s: all disk parts must share clustersize",
2952919bad8Sccardenas 			    __func__);
296f224f92aSccardenas 	}
29707e1a8caSori 	if (fstat(fd, &st) == -1)
29807e1a8caSori 		fatal("%s: unable to stat disk", __func__);
29950bebf2cSccardenas 
300f224f92aSccardenas 	disk->end = st.st_size;
30150bebf2cSccardenas 
302e2d3e60dSreyk 	log_debug("%s: qcow2 disk version %d size %lld end %lld snap %d",
30307e1a8caSori 	    __func__, version, disk->disksz, disk->end, disk->nsnap);
304e2d3e60dSreyk 
305f224f92aSccardenas 	return 0;
306f224f92aSccardenas }
307f224f92aSccardenas 
/*
 * Scatter read: fills each of the cnt buffers in iov in turn with
 * qc2_pread(), starting at disk offset `offset`.  Returns the number of
 * bytes read in total, or -1 as soon as one of the reads fails.
 */
static ssize_t
qc2_preadv(void *p, struct iovec *iov, int cnt, off_t offset)
{
	ssize_t nread, done = 0;
	off_t cur = offset;
	int k = 0;

	while (k < cnt) {
		nread = qc2_pread(p, iov[k].iov_base, iov[k].iov_len, cur);
		if (nread == -1)
			return (nread);
		done += nread;
		cur += nread;
		k++;
	}

	return (done);
}
32520e554f8Sdv 
32620e554f8Sdv static ssize_t
327f224f92aSccardenas qc2_pread(void *p, char *buf, size_t len, off_t off)
328f224f92aSccardenas {
329f224f92aSccardenas 	struct qcdisk *disk, *d;
330f224f92aSccardenas 	off_t phys_off, end, cluster_off;
331f224f92aSccardenas 	ssize_t sz, rem;
332f224f92aSccardenas 
333f224f92aSccardenas 	disk = p;
334f224f92aSccardenas 	end = off + len;
335f224f92aSccardenas 	if (off < 0 || end > disk->disksz)
336f224f92aSccardenas 		return -1;
337f224f92aSccardenas 
338f224f92aSccardenas 	/* handle head chunk separately */
339f224f92aSccardenas 	rem = len;
340f224f92aSccardenas 	while (off != end) {
341f224f92aSccardenas 		for (d = disk; d; d = d->base)
342f224f92aSccardenas 			if ((phys_off = xlate(d, off, NULL)) > 0)
343f224f92aSccardenas 				break;
344f224f92aSccardenas 		/* Break out into chunks. This handles
345f224f92aSccardenas 		 * three cases:
346f224f92aSccardenas 		 *
34707e1a8caSori 		 *    |----+====|========|====+-----|
348f224f92aSccardenas 		 *
349f224f92aSccardenas 		 * Either we are at the start of the read,
350f224f92aSccardenas 		 * and the cluster has some leading bytes.
351f224f92aSccardenas 		 * This means that we are reading the tail
352f224f92aSccardenas 		 * of the cluster, and our size is:
353f224f92aSccardenas 		 *
354f224f92aSccardenas 		 * 	clustersz - (off % clustersz).
355f224f92aSccardenas 		 *
356f224f92aSccardenas 		 * Otherwise, we're reading the middle section.
357f224f92aSccardenas 		 * We're already aligned here, so we can just
358f224f92aSccardenas 		 * read the whole cluster size. Or we're at the
359f224f92aSccardenas 		 * tail, at which point we just want to read the
360f224f92aSccardenas 		 * remaining bytes.
361f224f92aSccardenas 		 */
362f224f92aSccardenas 		cluster_off = off % disk->clustersz;
363f224f92aSccardenas 		sz = disk->clustersz - cluster_off;
364f224f92aSccardenas 		if (sz > rem)
365f224f92aSccardenas 			sz = rem;
366f224f92aSccardenas 		/*
367f224f92aSccardenas 		 * If we're within the disk, but don't have backing bytes,
368f224f92aSccardenas 		 * just read back zeros.
369f224f92aSccardenas 		 */
370f224f92aSccardenas 		if (!d)
371f224f92aSccardenas 			bzero(buf, sz);
372f224f92aSccardenas 		else if (pread(d->fd, buf, sz, phys_off) != sz)
373f224f92aSccardenas 			return -1;
374f224f92aSccardenas 		off += sz;
375f224f92aSccardenas 		buf += sz;
376f224f92aSccardenas 		rem -= sz;
377f224f92aSccardenas 	}
378f224f92aSccardenas 	return len;
379f224f92aSccardenas }
380f224f92aSccardenas 
/*
 * Gather write: writes each of the cnt buffers in iov in turn with
 * qc2_pwrite(), starting at disk offset `offset`.  Returns the number of
 * bytes written in total, or -1 as soon as one of the writes fails.
 */
static ssize_t
qc2_pwritev(void *p, struct iovec *iov, int cnt, off_t offset)
{
	ssize_t nwritten, done = 0;
	off_t cur = offset;
	int k = 0;

	while (k < cnt) {
		nwritten = qc2_pwrite(p, iov[k].iov_base, iov[k].iov_len, cur);
		if (nwritten == -1)
			return (nwritten);
		done += nwritten;
		cur += nwritten;
		k++;
	}

	return (done);
}
39820e554f8Sdv 
39920e554f8Sdv static ssize_t
400f224f92aSccardenas qc2_pwrite(void *p, char *buf, size_t len, off_t off)
401f224f92aSccardenas {
402f224f92aSccardenas 	struct qcdisk *disk, *d;
403f224f92aSccardenas 	off_t phys_off, cluster_off, end;
404f224f92aSccardenas 	ssize_t sz, rem;
405f224f92aSccardenas 	int inplace;
406f224f92aSccardenas 
407f224f92aSccardenas 	d = p;
408f224f92aSccardenas 	disk = p;
409f224f92aSccardenas 	inplace = 1;
410f224f92aSccardenas 	end = off + len;
411f224f92aSccardenas 	if (off < 0 || end > disk->disksz)
412f224f92aSccardenas 		return -1;
413f224f92aSccardenas 	rem = len;
414f224f92aSccardenas 	while (off != end) {
415f224f92aSccardenas 		/* See the read code for a summary of the computation */
416f224f92aSccardenas 		cluster_off = off % disk->clustersz;
417f224f92aSccardenas 		sz = disk->clustersz - cluster_off;
418f224f92aSccardenas 		if (sz > rem)
419f224f92aSccardenas 			sz = rem;
420f224f92aSccardenas 
421f224f92aSccardenas 		phys_off = xlate(disk, off, &inplace);
422f224f92aSccardenas 		if (phys_off == -1)
423f224f92aSccardenas 			return -1;
424f224f92aSccardenas 		/*
425f224f92aSccardenas 		 * If we couldn't find the cluster in the writable disk,
426f224f92aSccardenas 		 * see if it exists in the base image. If it does, we
427f224f92aSccardenas 		 * need to copy it before the write. The copy happens
428f224f92aSccardenas 		 * in the '!inplace' if clause below te search.
429f224f92aSccardenas 		 */
430f224f92aSccardenas 		if (phys_off == 0)
431f224f92aSccardenas 			for (d = disk->base; d; d = d->base)
432f224f92aSccardenas 				if ((phys_off = xlate(d, off, NULL)) > 0)
433f224f92aSccardenas 					break;
434f224f92aSccardenas 		if (!inplace || phys_off == 0)
435f224f92aSccardenas 			phys_off = mkcluster(disk, d, off, phys_off);
436f224f92aSccardenas 		if (phys_off == -1)
437f224f92aSccardenas 			return -1;
43807e1a8caSori 		if (phys_off < disk->clustersz)
43907e1a8caSori 			fatalx("%s: writing reserved cluster", __func__);
440f224f92aSccardenas 		if (pwrite(disk->fd, buf, sz, phys_off) != sz)
441f224f92aSccardenas 			return -1;
442f224f92aSccardenas 		off += sz;
443f224f92aSccardenas 		buf += sz;
444f224f92aSccardenas 		rem -= sz;
445f224f92aSccardenas 	}
446f224f92aSccardenas 	return len;
447f224f92aSccardenas }
448f224f92aSccardenas 
449f224f92aSccardenas static void
450f6c09be3Sreyk qc2_close(void *p, int stayopen)
451f224f92aSccardenas {
452f224f92aSccardenas 	struct qcdisk *disk;
453f224f92aSccardenas 
454f224f92aSccardenas 	disk = p;
45550bebf2cSccardenas 	if (disk->base)
456f6c09be3Sreyk 		qc2_close(disk->base, stayopen);
457f6c09be3Sreyk 	if (!stayopen)
458f224f92aSccardenas 		close(disk->fd);
45950bebf2cSccardenas 	free(disk->l1);
460f224f92aSccardenas 	free(disk);
461f224f92aSccardenas }
462f224f92aSccardenas 
463f224f92aSccardenas /*
464f224f92aSccardenas  * Translates a virtual offset into an on-disk offset.
465f224f92aSccardenas  * Returns:
466f224f92aSccardenas  * 	-1 on error
467f224f92aSccardenas  * 	 0 on 'not found'
468f224f92aSccardenas  * 	>0 on found
469f224f92aSccardenas  */
470f224f92aSccardenas static off_t
471f224f92aSccardenas xlate(struct qcdisk *disk, off_t off, int *inplace)
472f224f92aSccardenas {
473f224f92aSccardenas 	off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff;
474f224f92aSccardenas 	uint64_t buf;
475f224f92aSccardenas 
476f224f92aSccardenas 
477f224f92aSccardenas 	/*
478f224f92aSccardenas 	 * Clear out inplace flag -- xlate misses should not
479f224f92aSccardenas 	 * be flagged as updatable in place. We will still
480f224f92aSccardenas 	 * return 0 from them, but this leaves less surprises
481f224f92aSccardenas 	 * in the API.
482f224f92aSccardenas 	 */
483f224f92aSccardenas 	if (inplace)
484f224f92aSccardenas 		*inplace = 0;
485f224f92aSccardenas 	pthread_rwlock_rdlock(&disk->lock);
486f224f92aSccardenas 	if (off < 0)
487f224f92aSccardenas 		goto err;
488f224f92aSccardenas 
489f224f92aSccardenas 	l2sz = disk->clustersz / 8;
490f224f92aSccardenas 	l1off = (off / disk->clustersz) / l2sz;
491f224f92aSccardenas 	if (l1off >= disk->l1sz)
492f224f92aSccardenas 		goto err;
493f224f92aSccardenas 
494f224f92aSccardenas 	l2tab = disk->l1[l1off];
495f224f92aSccardenas 	l2tab &= ~QCOW2_INPLACE;
496f224f92aSccardenas 	if (l2tab == 0) {
497f224f92aSccardenas 		pthread_rwlock_unlock(&disk->lock);
498f224f92aSccardenas 		return 0;
499f224f92aSccardenas 	}
500f224f92aSccardenas 	l2off = (off / disk->clustersz) % l2sz;
501f224f92aSccardenas 	pread(disk->fd, &buf, sizeof(buf), l2tab + l2off * 8);
502f224f92aSccardenas 	cluster = be64toh(buf);
503f224f92aSccardenas 	/*
504f224f92aSccardenas 	 * cluster may be 0, but all future operations don't affect
505f224f92aSccardenas 	 * the return value.
506f224f92aSccardenas 	 */
507f224f92aSccardenas 	if (inplace)
508f224f92aSccardenas 		*inplace = !!(cluster & QCOW2_INPLACE);
50907e1a8caSori 	if (cluster & QCOW2_COMPRESSED)
51007e1a8caSori 		fatalx("%s: compressed clusters unsupported", __func__);
511f224f92aSccardenas 	pthread_rwlock_unlock(&disk->lock);
512f224f92aSccardenas 	clusteroff = 0;
513f224f92aSccardenas 	cluster &= ~QCOW2_INPLACE;
514f224f92aSccardenas 	if (cluster)
515f224f92aSccardenas 		clusteroff = off % disk->clustersz;
516f224f92aSccardenas 	return cluster + clusteroff;
517f224f92aSccardenas err:
518f224f92aSccardenas 	pthread_rwlock_unlock(&disk->lock);
519f224f92aSccardenas 	return -1;
520f224f92aSccardenas }
521f224f92aSccardenas 
522f224f92aSccardenas /*
523f224f92aSccardenas  * Allocates a new cluster on disk, creating a new L2 table
524f224f92aSccardenas  * if needed. The cluster starts off with a refs of one,
525f224f92aSccardenas  * and the writable bit set.
526f224f92aSccardenas  *
527f224f92aSccardenas  * Returns -1 on error, and the physical address within the
528f224f92aSccardenas  * cluster of the write offset if it exists.
529f224f92aSccardenas  */
530f224f92aSccardenas static off_t
531f224f92aSccardenas mkcluster(struct qcdisk *disk, struct qcdisk *base, off_t off, off_t src_phys)
532f224f92aSccardenas {
533f224f92aSccardenas 	off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff, orig;
534f224f92aSccardenas 	uint64_t buf;
535f224f92aSccardenas 
536f224f92aSccardenas 	pthread_rwlock_wrlock(&disk->lock);
537f224f92aSccardenas 
538f224f92aSccardenas 	cluster = -1;
539f224f92aSccardenas 	/* L1 entries always exist */
540f224f92aSccardenas 	l2sz = disk->clustersz / 8;
541f224f92aSccardenas 	l1off = off / (disk->clustersz * l2sz);
542f224f92aSccardenas 	if (l1off >= disk->l1sz)
54307e1a8caSori 		fatalx("l1 offset outside disk");
544f224f92aSccardenas 
545f224f92aSccardenas 	disk->end = (disk->end + disk->clustersz - 1) & ~(disk->clustersz - 1);
546f224f92aSccardenas 
547f224f92aSccardenas 	l2tab = disk->l1[l1off];
548f224f92aSccardenas 	l2off = (off / disk->clustersz) % l2sz;
549f224f92aSccardenas 	/* We may need to create or clone an L2 entry to map the block */
550f224f92aSccardenas 	if (l2tab == 0 || (l2tab & QCOW2_INPLACE) == 0) {
551f224f92aSccardenas 		orig = l2tab & ~QCOW2_INPLACE;
552f224f92aSccardenas 		l2tab = disk->end;
553f224f92aSccardenas 		disk->end += disk->clustersz;
55407e1a8caSori 		if (ftruncate(disk->fd, disk->end) == -1)
55507e1a8caSori 			fatal("%s: ftruncate failed", __func__);
556f224f92aSccardenas 
557f224f92aSccardenas 		/*
558f224f92aSccardenas 		 * If we translated, found a L2 entry, but it needed to
559f224f92aSccardenas 		 * be copied, copy it.
560f224f92aSccardenas 		 */
56107e1a8caSori 		if (orig != 0)
56207e1a8caSori 			copy_cluster(disk, disk, l2tab, orig);
563f224f92aSccardenas 		/* Update l1 -- we flush it later */
564f224f92aSccardenas 		disk->l1[l1off] = l2tab | QCOW2_INPLACE;
56507e1a8caSori 		inc_refs(disk, l2tab, 1);
566f224f92aSccardenas 	}
567f224f92aSccardenas 	l2tab &= ~QCOW2_INPLACE;
568f224f92aSccardenas 
569f224f92aSccardenas 	/* Grow the disk */
570f224f92aSccardenas 	if (ftruncate(disk->fd, disk->end + disk->clustersz) < 0)
5714a1c7b02Sdv 		fatal("%s: could not grow disk", __func__);
572f224f92aSccardenas 	if (src_phys > 0)
57307e1a8caSori 		copy_cluster(disk, base, disk->end, src_phys);
574f224f92aSccardenas 	cluster = disk->end;
575f224f92aSccardenas 	disk->end += disk->clustersz;
576f224f92aSccardenas 	buf = htobe64(cluster | QCOW2_INPLACE);
57773613953Sreyk 	if (pwrite(disk->fd, &buf, sizeof(buf), l2tab + l2off * 8) != 8)
57807e1a8caSori 		fatalx("%s: could not write cluster", __func__);
579f224f92aSccardenas 
580f224f92aSccardenas 	/* TODO: lazily sync: currently VMD doesn't close things */
581f224f92aSccardenas 	buf = htobe64(disk->l1[l1off]);
58273613953Sreyk 	if (pwrite(disk->fd, &buf, sizeof(buf), disk->l1off + 8 * l1off) != 8)
58307e1a8caSori 		fatalx("%s: could not write l1", __func__);
58407e1a8caSori 	inc_refs(disk, cluster, 1);
585f224f92aSccardenas 
586f224f92aSccardenas 	pthread_rwlock_unlock(&disk->lock);
587f224f92aSccardenas 	clusteroff = off % disk->clustersz;
58807e1a8caSori 	if (cluster + clusteroff < disk->clustersz)
58907e1a8caSori 		fatalx("write would clobber header");
590f224f92aSccardenas 	return cluster + clusteroff;
591f224f92aSccardenas }
592f224f92aSccardenas 
593f224f92aSccardenas /* Copies a cluster containing src to dst. Src and dst need not be aligned. */
59407e1a8caSori static void
595f224f92aSccardenas copy_cluster(struct qcdisk *disk, struct qcdisk *base, off_t dst, off_t src)
596f224f92aSccardenas {
597f224f92aSccardenas 	char *scratch;
598f224f92aSccardenas 
599fb6f09faSderaadt 	scratch = malloc(disk->clustersz);
600f224f92aSccardenas 	if (!scratch)
60107e1a8caSori 		fatal("out of memory");
602f224f92aSccardenas 	src &= ~(disk->clustersz - 1);
603f224f92aSccardenas 	dst &= ~(disk->clustersz - 1);
604f224f92aSccardenas 	if (pread(base->fd, scratch, disk->clustersz, src) == -1)
60507e1a8caSori 		fatal("%s: could not read cluster", __func__);
606f224f92aSccardenas 	if (pwrite(disk->fd, scratch, disk->clustersz, dst) == -1)
60707e1a8caSori 		fatal("%s: could not write cluster", __func__);
608fb6f09faSderaadt 	free(scratch);
609f224f92aSccardenas }
610f224f92aSccardenas 
61107e1a8caSori static void
612f224f92aSccardenas inc_refs(struct qcdisk *disk, off_t off, int newcluster)
613f224f92aSccardenas {
614f224f92aSccardenas 	off_t l1off, l1idx, l2idx, l2cluster;
615f224f92aSccardenas 	size_t nper;
616f224f92aSccardenas 	uint16_t refs;
617f224f92aSccardenas 	uint64_t buf;
618f224f92aSccardenas 
619f224f92aSccardenas 	off &= ~QCOW2_INPLACE;
620f224f92aSccardenas 	nper = disk->clustersz / 2;
621f224f92aSccardenas 	l1idx = (off / disk->clustersz) / nper;
622f224f92aSccardenas 	l2idx = (off / disk->clustersz) % nper;
623f224f92aSccardenas 	l1off = disk->refoff + 8 * l1idx;
62473613953Sreyk 	if (pread(disk->fd, &buf, sizeof(buf), l1off) != 8)
62507e1a8caSori 		fatal("could not read refs");
626f224f92aSccardenas 
627f224f92aSccardenas 	l2cluster = be64toh(buf);
628f224f92aSccardenas 	if (l2cluster == 0) {
629f224f92aSccardenas 		l2cluster = disk->end;
630f224f92aSccardenas 		disk->end += disk->clustersz;
63107e1a8caSori 		if (ftruncate(disk->fd, disk->end) < 0)
63207e1a8caSori 			fatal("%s: failed to allocate ref block", __func__);
633f224f92aSccardenas 		buf = htobe64(l2cluster);
63407e1a8caSori 		if (pwrite(disk->fd, &buf, sizeof(buf), l1off) != 8)
63507e1a8caSori 			fatal("%s: failed to write ref block", __func__);
636f224f92aSccardenas 	}
637f224f92aSccardenas 
638f224f92aSccardenas 	refs = 1;
639f224f92aSccardenas 	if (!newcluster) {
64073613953Sreyk 		if (pread(disk->fd, &refs, sizeof(refs),
64173613953Sreyk 		    l2cluster + 2 * l2idx) != 2)
64207e1a8caSori 			fatal("could not read ref cluster");
643f224f92aSccardenas 		refs = be16toh(refs) + 1;
644f224f92aSccardenas 	}
645f224f92aSccardenas 	refs = htobe16(refs);
64607e1a8caSori 	if (pwrite(disk->fd, &refs, sizeof(refs), l2cluster + 2 * l2idx) != 2)
64707e1a8caSori 		fatal("%s: could not write ref block", __func__);
648f224f92aSccardenas }
649f224f92aSccardenas 
65062df93eeSreyk /*
65162df93eeSreyk  * virtio_qcow2_create
65262df93eeSreyk  *
65362df93eeSreyk  * Create an empty qcow2 imagefile with the specified path and size.
65462df93eeSreyk  *
65562df93eeSreyk  * Parameters:
65662df93eeSreyk  *  imgfile_path: path to the image file to create
6572eec0843Sdv  *  imgsize     : size of the image file to create (in bytes)
65862df93eeSreyk  *
65962df93eeSreyk  * Return:
66062df93eeSreyk  *  EEXIST: The requested image file already exists
66162df93eeSreyk  *  0     : Image file successfully created
66262df93eeSreyk  *  Exxxx : Various other Exxxx errno codes due to other I/O errors
66362df93eeSreyk  */
66462df93eeSreyk int
66562df93eeSreyk virtio_qcow2_create(const char *imgfile_path,
666ead1b146Sdv     const char *base_path, uint64_t disksz)
66762df93eeSreyk {
668702b4317Sdv 	struct qcheader hdr, basehdr;
66962df93eeSreyk 	int fd, ret;
67062df93eeSreyk 	ssize_t base_len;
671ead1b146Sdv 	uint64_t l1sz, refsz, initsz, clustersz;
67262df93eeSreyk 	uint64_t l1off, refoff, v, i, l1entrysz, refentrysz;
67362df93eeSreyk 	uint16_t refs;
67462df93eeSreyk 
67562df93eeSreyk 	if (base_path) {
67662df93eeSreyk 		fd = open(base_path, O_RDONLY);
67762df93eeSreyk 		if (read(fd, &basehdr, sizeof(basehdr)) != sizeof(basehdr))
6784a1c7b02Sdv 			errx(1, "failure to read base image header");
67962df93eeSreyk 		close(fd);
68062df93eeSreyk 		if (strncmp(basehdr.magic,
68162df93eeSreyk 		    VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0)
68262df93eeSreyk 			errx(1, "base image is not a qcow2 file");
68362df93eeSreyk 		if (!disksz)
68462df93eeSreyk 			disksz = betoh64(basehdr.disksz);
68562df93eeSreyk 		else if (disksz != betoh64(basehdr.disksz))
68662df93eeSreyk 			errx(1, "base size does not match requested size");
68762df93eeSreyk 	}
68862df93eeSreyk 	if (!base_path && !disksz)
68962df93eeSreyk 		errx(1, "missing disk size");
69062df93eeSreyk 
69162df93eeSreyk 	clustersz = (1<<16);
69262df93eeSreyk 	l1off = ALIGNSZ(sizeof(hdr), clustersz);
69362df93eeSreyk 
69462df93eeSreyk 	l1entrysz = clustersz * clustersz / 8;
69562df93eeSreyk 	l1sz = (disksz + l1entrysz - 1) / l1entrysz;
69662df93eeSreyk 
69762df93eeSreyk 	refoff = ALIGNSZ(l1off + 8*l1sz, clustersz);
69862df93eeSreyk 	refentrysz = clustersz * clustersz * clustersz / 2;
69962df93eeSreyk 	refsz = (disksz + refentrysz - 1) / refentrysz;
70062df93eeSreyk 
70162df93eeSreyk 	initsz = ALIGNSZ(refoff + refsz*clustersz, clustersz);
70262df93eeSreyk 	base_len = base_path ? strlen(base_path) : 0;
70362df93eeSreyk 
70462df93eeSreyk 	memcpy(hdr.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW));
70562df93eeSreyk 	hdr.version		= htobe32(3);
70662df93eeSreyk 	hdr.backingoff		= htobe64(base_path ? sizeof(hdr) : 0);
70762df93eeSreyk 	hdr.backingsz		= htobe32(base_len);
70862df93eeSreyk 	hdr.clustershift	= htobe32(16);
70962df93eeSreyk 	hdr.disksz		= htobe64(disksz);
71062df93eeSreyk 	hdr.cryptmethod		= htobe32(0);
71162df93eeSreyk 	hdr.l1sz		= htobe32(l1sz);
71262df93eeSreyk 	hdr.l1off		= htobe64(l1off);
71362df93eeSreyk 	hdr.refoff		= htobe64(refoff);
71462df93eeSreyk 	hdr.refsz		= htobe32(refsz);
71562df93eeSreyk 	hdr.snapcount		= htobe32(0);
71662df93eeSreyk 	hdr.snapsz		= htobe64(0);
71762df93eeSreyk 	hdr.incompatfeatures	= htobe64(0);
71862df93eeSreyk 	hdr.compatfeatures	= htobe64(0);
71962df93eeSreyk 	hdr.autoclearfeatures	= htobe64(0);
72062df93eeSreyk 	hdr.reforder		= htobe32(4);
72162df93eeSreyk 	hdr.headersz		= htobe32(sizeof(hdr));
72262df93eeSreyk 
72362df93eeSreyk 	/* Refuse to overwrite an existing image */
72462df93eeSreyk 	fd = open(imgfile_path, O_RDWR | O_CREAT | O_TRUNC | O_EXCL,
72562df93eeSreyk 	    S_IRUSR | S_IWUSR);
72662df93eeSreyk 	if (fd == -1)
72762df93eeSreyk 		return (errno);
72862df93eeSreyk 
72962df93eeSreyk 	/* Write out the header */
73062df93eeSreyk 	if (write(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
73162df93eeSreyk 		goto error;
73262df93eeSreyk 
73362df93eeSreyk 	/* Add the base image */
73462df93eeSreyk 	if (base_path && write(fd, base_path, base_len) != base_len)
73562df93eeSreyk 		goto error;
73662df93eeSreyk 
73762df93eeSreyk 	/* Extend to desired size, and add one refcount cluster */
73862df93eeSreyk 	if (ftruncate(fd, (off_t)initsz + clustersz) == -1)
73962df93eeSreyk 		goto error;
74062df93eeSreyk 
74162df93eeSreyk 	/*
74262df93eeSreyk 	 * Paranoia: if our disk image takes more than one cluster
74362df93eeSreyk 	 * to refcount the initial image, fail.
74462df93eeSreyk 	 */
74562df93eeSreyk 	if (initsz/clustersz > clustersz/2) {
74662df93eeSreyk 		errno = ERANGE;
74762df93eeSreyk 		goto error;
74862df93eeSreyk 	}
74962df93eeSreyk 
75062df93eeSreyk 	/* Add a refcount block, and refcount ourselves. */
75162df93eeSreyk 	v = htobe64(initsz);
75262df93eeSreyk 	if (pwrite(fd, &v, 8, refoff) != 8)
75362df93eeSreyk 		goto error;
75462df93eeSreyk 	for (i = 0; i < initsz/clustersz + 1; i++) {
75562df93eeSreyk 		refs = htobe16(1);
75662df93eeSreyk 		if (pwrite(fd, &refs, 2, initsz + 2*i) != 2)
75762df93eeSreyk 			goto error;
75862df93eeSreyk 	}
75962df93eeSreyk 
76062df93eeSreyk 	ret = close(fd);
76162df93eeSreyk 	return (ret);
76262df93eeSreyk error:
76362df93eeSreyk 	ret = errno;
76462df93eeSreyk 	close(fd);
76562df93eeSreyk 	unlink(imgfile_path);
76662df93eeSreyk 	return (errno);
76762df93eeSreyk }
768