xref: /openbsd-src/usr.sbin/vmd/vm.c (revision d12ef5f337ab466012c0d0342965e310661b355a)
1*d12ef5f3Sclaudio /*	$OpenBSD: vm.c,v 1.110 2024/11/21 13:25:30 claudio Exp $	*/
21e1977eeSreyk 
31e1977eeSreyk /*
41e1977eeSreyk  * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
51e1977eeSreyk  *
61e1977eeSreyk  * Permission to use, copy, modify, and distribute this software for any
71e1977eeSreyk  * purpose with or without fee is hereby granted, provided that the above
81e1977eeSreyk  * copyright notice and this permission notice appear in all copies.
91e1977eeSreyk  *
101e1977eeSreyk  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
111e1977eeSreyk  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
121e1977eeSreyk  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
131e1977eeSreyk  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
141e1977eeSreyk  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
151e1977eeSreyk  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
161e1977eeSreyk  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
171e1977eeSreyk  */
181e1977eeSreyk 
19ad00e8c1Sdv #include <sys/param.h>	/* PAGE_SIZE, MAXCOMLEN */
201e1977eeSreyk #include <sys/types.h>
211e1977eeSreyk #include <sys/ioctl.h>
221e1977eeSreyk #include <sys/mman.h>
23fbbcf6cdSdv #include <sys/resource.h>
241e1977eeSreyk 
25ba66f564Sdv #include <dev/vmm/vmm.h>
261e1977eeSreyk 
271e1977eeSreyk #include <errno.h>
281e1977eeSreyk #include <event.h>
291e1977eeSreyk #include <fcntl.h>
301e1977eeSreyk #include <imsg.h>
311e1977eeSreyk #include <poll.h>
321e1977eeSreyk #include <pthread.h>
33ad00e8c1Sdv #include <pthread_np.h>
341e1977eeSreyk #include <stdio.h>
351e1977eeSreyk #include <stdlib.h>
361e1977eeSreyk #include <string.h>
371e1977eeSreyk #include <unistd.h>
381e1977eeSreyk #include <util.h>
391e1977eeSreyk 
406eb4c859Sdv #include "atomicio.h"
416eb4c859Sdv #include "pci.h"
426eb4c859Sdv #include "virtio.h"
436eb4c859Sdv #include "vmd.h"
44b1ba8534Sdv 
45bee70036Sdv #define MMIO_NOTYET 0
46bee70036Sdv 
4724386e31Sdv static int run_vm(struct vmop_create_params *, struct vcpu_reg_state *);
48c4fd4c5bSdv static void vm_dispatch_vmm(int, short, void *);
49c4fd4c5bSdv static void *event_thread(void *);
50c4fd4c5bSdv static void *vcpu_run_loop(void *);
5173a98491Sdv static int vmm_create_vm(struct vmd_vm *);
52c4fd4c5bSdv static int alloc_guest_mem(struct vmd_vm *);
5373a98491Sdv static int send_vm(int, struct vmd_vm *);
5473a98491Sdv static int dump_vmr(int, struct vm_mem_range *);
5573a98491Sdv static int dump_mem(int, struct vmd_vm *);
56c4fd4c5bSdv static void restore_vmr(int, struct vm_mem_range *);
57c4fd4c5bSdv static void restore_mem(int, struct vm_create_params *);
58c4fd4c5bSdv static int restore_vm_params(int, struct vm_create_params *);
5973a98491Sdv static void pause_vm(struct vmd_vm *);
6073a98491Sdv static void unpause_vm(struct vmd_vm *);
61c4fd4c5bSdv static int start_vm(struct vmd_vm *, int);
621e1977eeSreyk 
631e1977eeSreyk int con_fd;
641e1977eeSreyk struct vmd_vm *current_vm;
651e1977eeSreyk 
661e1977eeSreyk extern struct vmd *env;
671e1977eeSreyk 
681e1977eeSreyk extern char *__progname;
691e1977eeSreyk 
701e1977eeSreyk pthread_mutex_t threadmutex;
711e1977eeSreyk pthread_cond_t threadcond;
721e1977eeSreyk 
731e1977eeSreyk pthread_cond_t vcpu_run_cond[VMM_MAX_VCPUS_PER_VM];
741e1977eeSreyk pthread_mutex_t vcpu_run_mtx[VMM_MAX_VCPUS_PER_VM];
75ad9e848cSpd pthread_barrier_t vm_pause_barrier;
76548054a9Spd pthread_cond_t vcpu_unpause_cond[VMM_MAX_VCPUS_PER_VM];
77548054a9Spd pthread_mutex_t vcpu_unpause_mtx[VMM_MAX_VCPUS_PER_VM];
785195cf3eSdv 
795195cf3eSdv pthread_mutex_t vm_mtx;
801e1977eeSreyk uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM];
811e1977eeSreyk uint8_t vcpu_done[VMM_MAX_VCPUS_PER_VM];
821e1977eeSreyk 
831e1977eeSreyk /*
8424386e31Sdv  * vm_main
8524386e31Sdv  *
8624386e31Sdv  * Primary entrypoint for launching a vm. Does not return.
8724386e31Sdv  *
8824386e31Sdv  * fd: file descriptor for communicating with vmm process.
893c817da7Sdv  * fd_vmm: file descriptor for communicating with vmm(4) device
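 *
 * On startup the parent writes a struct vmd_vm followed by the local
 * prefix settings over fd; start_vm() later replies on the same fd with
 * the vmm(4)-assigned vm id, or 0 on failure.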
9024386e31Sdv  */
9124386e31Sdv void
92b3bc6112Sdv vm_main(int fd, int fd_vmm)
9324386e31Sdv {
9424386e31Sdv 	struct vm_create_params	*vcp = NULL;
9524386e31Sdv 	struct vmd_vm		 vm;
9624386e31Sdv 	size_t			 sz = 0;
9724386e31Sdv 	int			 ret = 0;
9824386e31Sdv 
9924386e31Sdv 	/*
100b3bc6112Sdv 	 * The vm process relies on global state. Set the fd for /dev/vmm.
101b3bc6112Sdv 	 */
102b3bc6112Sdv 	env->vmd_fd = fd_vmm;
103b3bc6112Sdv 
104b3bc6112Sdv 	/*
10524386e31Sdv 	 * We aren't root, so we can't chroot(2). Use unveil(2) instead.
10624386e31Sdv 	 */
1073481ecdfSdv 	if (unveil(env->argv0, "x") == -1)
1083481ecdfSdv 		fatal("unveil %s", env->argv0);
10924386e31Sdv 	if (unveil(NULL, NULL) == -1)
11024386e31Sdv 		fatal("unveil lock");
11124386e31Sdv 
11224386e31Sdv 	/*
11324386e31Sdv 	 * pledge in the vm processes:
11424386e31Sdv 	 * stdio - for malloc and basic I/O including events.
11524386e31Sdv 	 * vmm - for the vmm ioctls and operations.
1163481ecdfSdv 	 * proc exec - fork/exec for launching devices.
11724386e31Sdv 	 * recvfd - for vm send/recv and sending fd to devices.
11824386e31Sdv 	 */
1193c817da7Sdv 	if (pledge("stdio vmm proc exec recvfd", NULL) == -1)
12024386e31Sdv 		fatal("pledge");
12124386e31Sdv 
12224386e31Sdv 	/* Receive our vm configuration. */
12324386e31Sdv 	memset(&vm, 0, sizeof(vm));
12424386e31Sdv 	sz = atomicio(read, fd, &vm, sizeof(vm));
12524386e31Sdv 	if (sz != sizeof(vm)) {
12624386e31Sdv 		log_warnx("failed to receive start message");
12724386e31Sdv 		_exit(EIO);
12824386e31Sdv 	}
12924386e31Sdv 
13024386e31Sdv 	/* Update process with the vm name. */
13124386e31Sdv 	vcp = &vm.vm_params.vmc_params;
13224386e31Sdv 	setproctitle("%s", vcp->vcp_name);
13308d0da61Sdv 	log_procinit("vm/%s", vcp->vcp_name);
13424386e31Sdv 
1352272e586Sdv 	/* Receive the local prefix settings. */
1362272e586Sdv 	sz = atomicio(read, fd, &env->vmd_cfg.cfg_localprefix,
1372272e586Sdv 	    sizeof(env->vmd_cfg.cfg_localprefix));
1382272e586Sdv 	if (sz != sizeof(env->vmd_cfg.cfg_localprefix)) {
1392272e586Sdv 		log_warnx("failed to receive local prefix");
1402272e586Sdv 		_exit(EIO);
1412272e586Sdv 	}
1422272e586Sdv 
14324386e31Sdv 	/*
14424386e31Sdv 	 * We need, at minimum, a vm_kernel fd to boot a vm. This is either a
14524386e31Sdv 	 * kernel or a BIOS image.
14624386e31Sdv 	 */
147b848b186Sdv 	if (!(vm.vm_state & VM_STATE_RECEIVED)) {
148b848b186Sdv 		if (vm.vm_kernel == -1) {
149b848b186Sdv 			log_warnx("%s: failed to receive boot fd",
150b848b186Sdv 			    vcp->vcp_name);
15124386e31Sdv 			_exit(EINVAL);
15224386e31Sdv 		}
153b848b186Sdv 	}
15424386e31Sdv 
155f4b47ae8Sbluhm 	if (vcp->vcp_sev && env->vmd_psp_fd < 0) {
156f4b47ae8Sbluhm 		log_warnx("%s not available", PSP_NODE);
157f4b47ae8Sbluhm 		_exit(EINVAL);
158f4b47ae8Sbluhm 	}
159f4b47ae8Sbluhm 
16024386e31Sdv 	ret = start_vm(&vm, fd);
16124386e31Sdv 	_exit(ret);
16224386e31Sdv }
16324386e31Sdv 
16424386e31Sdv /*
1651e1977eeSreyk  * start_vm
1661e1977eeSreyk  *
1671e1977eeSreyk  * After forking a new VM process, starts the new VM with the creation
1681e1977eeSreyk  * parameters supplied (in the incoming vm->vm_params field). This
1691e1977eeSreyk  * function performs a basic sanity check on the incoming parameters
1701e1977eeSreyk  * and then performs the following steps to complete the creation of the VM:
1711e1977eeSreyk  *
1721e1977eeSreyk  * 1. validates and create the new VM
1721e1977eeSreyk  * 1. validates and creates the new VM
1743a50f0a9Sjmc  * 3. drops additional privileges by calling pledge(2)
1751e1977eeSreyk  * 4. loads the kernel from the disk image or file descriptor
1761e1977eeSreyk  * 5. runs the VM's VCPU loops.
1771e1977eeSreyk  *
1781e1977eeSreyk  * Parameters:
1791e1977eeSreyk  *  vm: The VM data structure that includes the VM create parameters.
1801e1977eeSreyk  *  fd: The imsg socket that is connected to the parent process.
1811e1977eeSreyk  *
1821e1977eeSreyk  * Return values:
1831e1977eeSreyk  *  0: success
1841e1977eeSreyk  *  !0 : failure - typically an errno indicating the source of the failure
1851e1977eeSreyk  */
1861e1977eeSreyk int
1871e1977eeSreyk start_vm(struct vmd_vm *vm, int fd)
1881e1977eeSreyk {
189f6c09be3Sreyk 	struct vmop_create_params *vmc = &vm->vm_params;
190f6c09be3Sreyk 	struct vm_create_params	*vcp = &vmc->vmc_params;
1911e1977eeSreyk 	struct vcpu_reg_state	 vrs;
192d489aa7eSdv 	int			 nicfds[VM_MAX_NICS_PER_VM];
1931e1977eeSreyk 	int			 ret;
1941e1977eeSreyk 	size_t			 i;
195eed20f3bSpd 	struct vm_rwregs_params  vrp;
1961e1977eeSreyk 
19724386e31Sdv 	/*
19824386e31Sdv 	 * We first try to initialize and allocate memory before bothering
19924386e31Sdv 	 * vmm(4) with a request to create a new vm.
20024386e31Sdv 	 */
20119700f36Sjasper 	if (!(vm->vm_state & VM_STATE_RECEIVED))
2021e1977eeSreyk 		create_memory_map(vcp);
203eed20f3bSpd 
2043481ecdfSdv 	ret = alloc_guest_mem(vm);
2051e1977eeSreyk 	if (ret) {
206fbbcf6cdSdv 		struct rlimit lim;
207fbbcf6cdSdv 		char buf[FMT_SCALED_STRSIZE];
208fbbcf6cdSdv 		if (ret == ENOMEM && getrlimit(RLIMIT_DATA, &lim) == 0) {
209fbbcf6cdSdv 			if (fmt_scaled(lim.rlim_cur, buf) == 0)
210fbbcf6cdSdv 				fatalx("could not allocate guest memory (data "
211fbbcf6cdSdv 				    "limit is %s)", buf);
212fbbcf6cdSdv 		}
2131e1977eeSreyk 		errno = ret;
21424386e31Sdv 		log_warn("could not allocate guest memory");
21524386e31Sdv 		return (ret);
2161e1977eeSreyk 	}
2171e1977eeSreyk 
21824386e31Sdv 	/* We've allocated guest memory, so now create the vm in vmm(4). */
21973a98491Sdv 	ret = vmm_create_vm(vm);
2201e1977eeSreyk 	if (ret) {
22124386e31Sdv 		/* Let the vmm process know we failed by sending a 0 vm id. */
22224386e31Sdv 		vcp->vcp_id = 0;
22324386e31Sdv 		atomicio(vwrite, fd, &vcp->vcp_id, sizeof(vcp->vcp_id));
22424386e31Sdv 		return (ret);
2251e1977eeSreyk 	}
2261e1977eeSreyk 
227f4b47ae8Sbluhm 	/* Setup SEV. */
228f4b47ae8Sbluhm 	ret = sev_init(vm);
229f4b47ae8Sbluhm 	if (ret) {
230f4b47ae8Sbluhm 		log_warnx("could not initialize SEV");
231f4b47ae8Sbluhm 		return (ret);
232f4b47ae8Sbluhm 	}
233f4b47ae8Sbluhm 
23424386e31Sdv 	/*
23524386e31Sdv 	 * Some of vmd currently relies on global state (current_vm, con_fd).
23624386e31Sdv 	 */
23724386e31Sdv 	current_vm = vm;
23824386e31Sdv 	con_fd = vm->vm_tty;
23924386e31Sdv 	if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1) {
24024386e31Sdv 		log_warn("failed to set nonblocking mode on console");
24124386e31Sdv 		return (1);
24224386e31Sdv 	}
24324386e31Sdv 
24424386e31Sdv 	/*
24524386e31Sdv 	 * We now let the vmm process know we were successful by sending it our
24624386e31Sdv 	 * vmm(4) assigned vm id.
24724386e31Sdv 	 */
24824386e31Sdv 	if (atomicio(vwrite, fd, &vcp->vcp_id, sizeof(vcp->vcp_id)) !=
24924386e31Sdv 	    sizeof(vcp->vcp_id)) {
25024386e31Sdv 		log_warn("failed to send created vm id to vmm process");
25124386e31Sdv 		return (1);
25224386e31Sdv 	}
25324386e31Sdv 
25424386e31Sdv 	/* Either prepare our boot image or receive an existing vm to launch. */
25519700f36Sjasper 	if (vm->vm_state & VM_STATE_RECEIVED) {
256fbbcf6cdSdv 		ret = atomicio(read, vm->vm_receive_fd, &vrp, sizeof(vrp));
257fbbcf6cdSdv 		if (ret != sizeof(vrp))
258eed20f3bSpd 			fatal("received incomplete vrp - exiting");
259eed20f3bSpd 		vrs = vrp.vrwp_regs;
260c4fd4c5bSdv 	} else if (load_firmware(vm, &vrs))
261c4fd4c5bSdv 		fatalx("failed to load kernel or firmware image");
2621e1977eeSreyk 
2631e1977eeSreyk 	if (vm->vm_kernel != -1)
26424386e31Sdv 		close_fd(vm->vm_kernel);
2651e1977eeSreyk 
26624386e31Sdv 	/* Initialize our mutexes. */
26724386e31Sdv 	ret = pthread_mutex_init(&threadmutex, NULL);
26824386e31Sdv 	if (ret) {
26924386e31Sdv 		log_warn("%s: could not initialize thread state mutex",
27024386e31Sdv 		    __func__);
27124386e31Sdv 		return (ret);
27224386e31Sdv 	}
27324386e31Sdv 	ret = pthread_cond_init(&threadcond, NULL);
27424386e31Sdv 	if (ret) {
27524386e31Sdv 		log_warn("%s: could not initialize thread state "
27624386e31Sdv 		    "condition variable", __func__);
27724386e31Sdv 		return (ret);
27824386e31Sdv 	}
2795195cf3eSdv 	ret = pthread_mutex_init(&vm_mtx, NULL);
2805195cf3eSdv 	if (ret) {
2815195cf3eSdv 		log_warn("%s: could not initialize vm state mutex",
2825195cf3eSdv 		    __func__);
2835195cf3eSdv 		return (ret);
2845195cf3eSdv 	}
2851e1977eeSreyk 
2865195cf3eSdv 	/* Lock thread mutex now. It's unlocked when waiting on threadcond. */
2875195cf3eSdv 	mutex_lock(&threadmutex);
2881e1977eeSreyk 
28924386e31Sdv 	/*
29024386e31Sdv 	 * Finalize our communication socket with the vmm process. From here
29124386e31Sdv 	 * onwards, communication with the vmm process is event-based.
29224386e31Sdv 	 */
2939d3767a2Smlarkin 	event_init();
29424386e31Sdv 	if (vmm_pipe(vm, fd, vm_dispatch_vmm) == -1)
29524386e31Sdv 		fatal("setup vm pipe");
29624386e31Sdv 
29724386e31Sdv 	/*
29824386e31Sdv 	 * Initialize or restore our emulated hardware.
29924386e31Sdv 	 */
30024386e31Sdv 	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
30124386e31Sdv 		nicfds[i] = vm->vm_ifs[i].vif_fd;
3021e1977eeSreyk 
30319700f36Sjasper 	if (vm->vm_state & VM_STATE_RECEIVED) {
3043481ecdfSdv 		restore_mem(vm->vm_receive_fd, vcp);
305eed20f3bSpd 		restore_emulated_hw(vcp, vm->vm_receive_fd, nicfds,
30695ab188fSccardenas 		    vm->vm_disks, vm->vm_cdrom);
307cbd2a590Spd 		if (restore_vm_params(vm->vm_receive_fd, vcp))
308cbd2a590Spd 			fatal("restore vm params failed");
30973a98491Sdv 		unpause_vm(vm);
31024386e31Sdv 	} else
31124386e31Sdv 		init_emulated_hw(vmc, vm->vm_cdrom, vm->vm_disks, nicfds);
312eed20f3bSpd 
3133481ecdfSdv 	/* Drop privileges further before starting the vcpu run loop(s). */
3143481ecdfSdv 	if (pledge("stdio vmm recvfd", NULL) == -1)
3153481ecdfSdv 		fatal("pledge");
3163481ecdfSdv 
31724386e31Sdv 	/*
31824386e31Sdv 	 * Execute the vcpu run loop(s) for this VM.
31924386e31Sdv 	 */
32024386e31Sdv 	ret = run_vm(&vm->vm_params, &vrs);
3211e1977eeSreyk 
322f4b47ae8Sbluhm 	/* Shutdown SEV. */
323f4b47ae8Sbluhm 	if (sev_shutdown(vm))
324f4b47ae8Sbluhm 		log_warnx("%s: could not shutdown SEV", __func__);
325f4b47ae8Sbluhm 
32650bebf2cSccardenas 	/* Ensure that any in-flight data is written back */
32750bebf2cSccardenas 	virtio_shutdown(vm);
32850bebf2cSccardenas 
3291e1977eeSreyk 	return (ret);
3301e1977eeSreyk }
3311e1977eeSreyk 
3321e1977eeSreyk /*
3331e1977eeSreyk  * vm_dispatch_vmm
3341e1977eeSreyk  *
3351e1977eeSreyk  * imsg callback for messages that are received from the vmm parent process.
3361e1977eeSreyk  */
3371e1977eeSreyk void
3381e1977eeSreyk vm_dispatch_vmm(int fd, short event, void *arg)
3391e1977eeSreyk {
3401e1977eeSreyk 	struct vmd_vm		*vm = arg;
34152e954a3Spd 	struct vmop_result	 vmr;
34297f33f1dSdv 	struct vmop_addr_result	 var;
3431e1977eeSreyk 	struct imsgev		*iev = &vm->vm_iev;
3441e1977eeSreyk 	struct imsgbuf		*ibuf = &iev->ibuf;
3451e1977eeSreyk 	struct imsg		 imsg;
3461e1977eeSreyk 	ssize_t			 n;
3471e1977eeSreyk 	int			 verbose;
3481e1977eeSreyk 
3491e1977eeSreyk 	if (event & EV_READ) {
350*d12ef5f3Sclaudio 		if ((n = imsgbuf_read(ibuf)) == -1)
351dd7efffeSclaudio 			fatal("%s: imsgbuf_read", __func__);
3521e1977eeSreyk 		if (n == 0)
3531e1977eeSreyk 			_exit(0);
3541e1977eeSreyk 	}
3551e1977eeSreyk 
3561e1977eeSreyk 	if (event & EV_WRITE) {
357dd7efffeSclaudio 		if (imsgbuf_write(ibuf) == -1) {
358c1aa9554Sclaudio 			if (errno == EPIPE)
3591e1977eeSreyk 				_exit(0);
360dd7efffeSclaudio 			fatal("%s: imsgbuf_write fd %d", __func__, ibuf->fd);
361c1aa9554Sclaudio 		}
3621e1977eeSreyk 	}
3631e1977eeSreyk 
3641e1977eeSreyk 	for (;;) {
3651e1977eeSreyk 		if ((n = imsg_get(ibuf, &imsg)) == -1)
3661e1977eeSreyk 			fatal("%s: imsg_get", __func__);
3671e1977eeSreyk 		if (n == 0)
3681e1977eeSreyk 			break;
3691e1977eeSreyk 
3701e1977eeSreyk #if DEBUG > 1
3711e1977eeSreyk 		log_debug("%s: got imsg %d from %s",
3721e1977eeSreyk 		    __func__, imsg.hdr.type,
3731e1977eeSreyk 		    vm->vm_params.vmc_params.vcp_name);
3741e1977eeSreyk #endif
3751e1977eeSreyk 
3761e1977eeSreyk 		switch (imsg.hdr.type) {
3771e1977eeSreyk 		case IMSG_CTL_VERBOSE:
3781e1977eeSreyk 			IMSG_SIZE_CHECK(&imsg, &verbose);
3791e1977eeSreyk 			memcpy(&verbose, imsg.data, sizeof(verbose));
3801e1977eeSreyk 			log_setverbose(verbose);
38108d0da61Sdv 			virtio_broadcast_imsg(vm, IMSG_CTL_VERBOSE, &verbose,
38208d0da61Sdv 			    sizeof(verbose));
3831e1977eeSreyk 			break;
3841e1977eeSreyk 		case IMSG_VMDOP_VM_SHUTDOWN:
3851e1977eeSreyk 			if (vmmci_ctl(VMMCI_SHUTDOWN) == -1)
3861e1977eeSreyk 				_exit(0);
3871e1977eeSreyk 			break;
3881e1977eeSreyk 		case IMSG_VMDOP_VM_REBOOT:
3891e1977eeSreyk 			if (vmmci_ctl(VMMCI_REBOOT) == -1)
3901e1977eeSreyk 				_exit(0);
3911e1977eeSreyk 			break;
39252e954a3Spd 		case IMSG_VMDOP_PAUSE_VM:
39352e954a3Spd 			vmr.vmr_result = 0;
39452e954a3Spd 			vmr.vmr_id = vm->vm_vmid;
39573a98491Sdv 			pause_vm(vm);
39652e954a3Spd 			imsg_compose_event(&vm->vm_iev,
39752e954a3Spd 			    IMSG_VMDOP_PAUSE_VM_RESPONSE,
39852e954a3Spd 			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
39952e954a3Spd 			    sizeof(vmr));
40052e954a3Spd 			break;
40152e954a3Spd 		case IMSG_VMDOP_UNPAUSE_VM:
40252e954a3Spd 			vmr.vmr_result = 0;
40352e954a3Spd 			vmr.vmr_id = vm->vm_vmid;
40473a98491Sdv 			unpause_vm(vm);
40552e954a3Spd 			imsg_compose_event(&vm->vm_iev,
40652e954a3Spd 			    IMSG_VMDOP_UNPAUSE_VM_RESPONSE,
40752e954a3Spd 			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
40852e954a3Spd 			    sizeof(vmr));
40952e954a3Spd 			break;
410eed20f3bSpd 		case IMSG_VMDOP_SEND_VM_REQUEST:
411eed20f3bSpd 			vmr.vmr_id = vm->vm_vmid;
41253027660Sclaudio 			vmr.vmr_result = send_vm(imsg_get_fd(&imsg), vm);
413eed20f3bSpd 			imsg_compose_event(&vm->vm_iev,
414eed20f3bSpd 			    IMSG_VMDOP_SEND_VM_RESPONSE,
415eed20f3bSpd 			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
416eed20f3bSpd 			    sizeof(vmr));
417a31b2e6bSpd 			if (!vmr.vmr_result) {
418dd7efffeSclaudio 				imsgbuf_flush(&current_vm->vm_iev.ibuf);
419a31b2e6bSpd 				_exit(0);
420a31b2e6bSpd 			}
421eed20f3bSpd 			break;
42297f33f1dSdv 		case IMSG_VMDOP_PRIV_GET_ADDR_RESPONSE:
42397f33f1dSdv 			IMSG_SIZE_CHECK(&imsg, &var);
42497f33f1dSdv 			memcpy(&var, imsg.data, sizeof(var));
42597f33f1dSdv 
42697f33f1dSdv 			log_debug("%s: received tap addr %s for nic %d",
42797f33f1dSdv 			    vm->vm_params.vmc_params.vcp_name,
42897f33f1dSdv 			    ether_ntoa((void *)var.var_addr), var.var_nic_idx);
42997f33f1dSdv 
43097f33f1dSdv 			vionet_set_hostmac(vm, var.var_nic_idx, var.var_addr);
43197f33f1dSdv 			break;
4321e1977eeSreyk 		default:
4331e1977eeSreyk 			fatalx("%s: got invalid imsg %d from %s",
4341e1977eeSreyk 			    __func__, imsg.hdr.type,
4351e1977eeSreyk 			    vm->vm_params.vmc_params.vcp_name);
4361e1977eeSreyk 		}
4371e1977eeSreyk 		imsg_free(&imsg);
4381e1977eeSreyk 	}
4391e1977eeSreyk 	imsg_event_add(iev);
4401e1977eeSreyk }
4411e1977eeSreyk 
4421e1977eeSreyk /*
4437d0a6c3dSmlarkin  * vm_shutdown
4443320a88dSreyk  *
4453320a88dSreyk  * Tell the vmm parent process to shutdown or reboot the VM and exit.
4463320a88dSreyk  */
4473320a88dSreyk __dead void
4483320a88dSreyk vm_shutdown(unsigned int cmd)
4493320a88dSreyk {
4503320a88dSreyk 	switch (cmd) {
4513320a88dSreyk 	case VMMCI_NONE:
4523320a88dSreyk 	case VMMCI_SHUTDOWN:
4535bbb2f6eSreyk 		(void)imsg_compose_event(&current_vm->vm_iev,
4545bbb2f6eSreyk 		    IMSG_VMDOP_VM_SHUTDOWN, 0, 0, -1, NULL, 0);
4553320a88dSreyk 		break;
4563320a88dSreyk 	case VMMCI_REBOOT:
4575bbb2f6eSreyk 		(void)imsg_compose_event(&current_vm->vm_iev,
4585bbb2f6eSreyk 		    IMSG_VMDOP_VM_REBOOT, 0, 0, -1, NULL, 0);
4593320a88dSreyk 		break;
4603320a88dSreyk 	default:
4613320a88dSreyk 		fatalx("invalid vm ctl command: %d", cmd);
4623320a88dSreyk 	}
463dd7efffeSclaudio 	imsgbuf_flush(&current_vm->vm_iev.ibuf);
4643320a88dSreyk 
465f4b47ae8Sbluhm 	if (sev_shutdown(current_vm))
466f4b47ae8Sbluhm 		log_warnx("%s: could not shutdown SEV", __func__);
467f4b47ae8Sbluhm 
4683320a88dSreyk 	_exit(0);
4693320a88dSreyk }
4703320a88dSreyk 
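/*
 * send_vm
 *
 * Dumps the state of a paused vm over the given fd in a fixed order:
 * dump header, vmop_create_params, per-vcpu register state, guest
 * memory, device state (devices, pci, virtio), and finally per-vcpu
 * vm params. On success the vm is terminated in vmm(4); on failure it
 * is unpaused so it can keep running.
 *
 * Return values:
 *  0: success
 *  !0 : failure
 */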
471eed20f3bSpd int
47273a98491Sdv send_vm(int fd, struct vmd_vm *vm)
473eed20f3bSpd {
474eed20f3bSpd 	struct vm_rwregs_params	   vrp;
475cbd2a590Spd 	struct vm_rwvmparams_params vpp;
476eed20f3bSpd 	struct vmop_create_params *vmc;
477eed20f3bSpd 	struct vm_terminate_params vtp;
478eed20f3bSpd 	unsigned int		   flags = 0;
479eed20f3bSpd 	unsigned int		   i;
480eed20f3bSpd 	int			   ret = 0;
48168482e07Smlarkin 	size_t			   sz;
482eed20f3bSpd 
4832d671a23Spd 	if (dump_send_header(fd)) {
4843481ecdfSdv 		log_warnx("%s: failed to send vm dump header", __func__);
485eed20f3bSpd 		goto err;
486eed20f3bSpd 	}
487eed20f3bSpd 
48873a98491Sdv 	pause_vm(vm);
489eed20f3bSpd 
490eed20f3bSpd 	vmc = calloc(1, sizeof(struct vmop_create_params));
491eed20f3bSpd 	if (vmc == NULL) {
4923a50f0a9Sjmc 		log_warn("%s: calloc error getting vmc", __func__);
493eed20f3bSpd 		ret = -1;
494eed20f3bSpd 		goto err;
495eed20f3bSpd 	}
496eed20f3bSpd 
497eed20f3bSpd 	flags |= VMOP_CREATE_MEMORY;
498eed20f3bSpd 	memcpy(&vmc->vmc_params, &current_vm->vm_params, sizeof(struct
499eed20f3bSpd 	    vmop_create_params));
500eed20f3bSpd 	vmc->vmc_flags = flags;
50173a98491Sdv 	vrp.vrwp_vm_id = vm->vm_params.vmc_params.vcp_id;
502eed20f3bSpd 	vrp.vrwp_mask = VM_RWREGS_ALL;
503cbd2a590Spd 	vpp.vpp_mask = VM_RWVMPARAMS_ALL;
50473a98491Sdv 	vpp.vpp_vm_id = vm->vm_params.vmc_params.vcp_id;
505eed20f3bSpd 
50668482e07Smlarkin 	sz = atomicio(vwrite, fd, vmc, sizeof(struct vmop_create_params));
50768482e07Smlarkin 	if (sz != sizeof(struct vmop_create_params)) {
50868482e07Smlarkin 		ret = -1;
509eed20f3bSpd 		goto err;
51068482e07Smlarkin 	}
511eed20f3bSpd 
51273a98491Sdv 	for (i = 0; i < vm->vm_params.vmc_params.vcp_ncpus; i++) {
513eed20f3bSpd 		vrp.vrwp_vcpu_id = i;
514eed20f3bSpd 		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READREGS, &vrp))) {
515eed20f3bSpd 			log_warn("%s: readregs failed", __func__);
516eed20f3bSpd 			goto err;
517eed20f3bSpd 		}
51868482e07Smlarkin 
51968482e07Smlarkin 		sz = atomicio(vwrite, fd, &vrp,
52068482e07Smlarkin 		    sizeof(struct vm_rwregs_params));
52168482e07Smlarkin 		if (sz != sizeof(struct vm_rwregs_params)) {
522eed20f3bSpd 			log_warn("%s: dumping registers failed", __func__);
52368482e07Smlarkin 			ret = -1;
524eed20f3bSpd 			goto err;
525eed20f3bSpd 		}
526eed20f3bSpd 	}
527eed20f3bSpd 
5283481ecdfSdv 	/* Dump memory before devices to aid in restoration. */
5293481ecdfSdv 	if ((ret = dump_mem(fd, vm)))
5303481ecdfSdv 		goto err;
531c4fd4c5bSdv 	if ((ret = dump_devs(fd)))
532622c1441Sclaudio 		goto err;
533813e3047Spd 	if ((ret = pci_dump(fd)))
534813e3047Spd 		goto err;
535eed20f3bSpd 	if ((ret = virtio_dump(fd)))
536eed20f3bSpd 		goto err;
537eed20f3bSpd 
53873a98491Sdv 	for (i = 0; i < vm->vm_params.vmc_params.vcp_ncpus; i++) {
539cbd2a590Spd 		vpp.vpp_vcpu_id = i;
540cbd2a590Spd 		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READVMPARAMS, &vpp))) {
541cbd2a590Spd 			log_warn("%s: readvmparams failed", __func__);
542cbd2a590Spd 			goto err;
543cbd2a590Spd 		}
544cbd2a590Spd 
545cbd2a590Spd 		sz = atomicio(vwrite, fd, &vpp,
546cbd2a590Spd 		    sizeof(struct vm_rwvmparams_params));
547cbd2a590Spd 		if (sz != sizeof(struct vm_rwvmparams_params)) {
548cbd2a590Spd 			log_warn("%s: dumping vm params failed", __func__);
549cbd2a590Spd 			ret = -1;
550cbd2a590Spd 			goto err;
551cbd2a590Spd 		}
552cbd2a590Spd 	}
553cbd2a590Spd 
55473a98491Sdv 	vtp.vtp_vm_id = vm->vm_params.vmc_params.vcp_id;
555df69c215Sderaadt 	if (ioctl(env->vmd_fd, VMM_IOC_TERM, &vtp) == -1) {
556eed20f3bSpd 		log_warnx("%s: term IOC error: %d", __func__, errno);
558eed20f3bSpd 	}
559eed20f3bSpd err:
560eed20f3bSpd 	close(fd);
561eed20f3bSpd 	if (ret)
56273a98491Sdv 		unpause_vm(vm);
563eed20f3bSpd 	return ret;
564eed20f3bSpd }
565eed20f3bSpd 
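/*
 * dump_mem
 *
 * Writes every guest memory range to fd using dump_vmr().
 *
 * Return values:
 *  0: success
 *  !0 : failure
 */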
566eed20f3bSpd int
56773a98491Sdv dump_mem(int fd, struct vmd_vm *vm)
568eed20f3bSpd {
569eed20f3bSpd 	unsigned int	i;
570eed20f3bSpd 	int		ret;
571eed20f3bSpd 	struct		vm_mem_range *vmr;
572eed20f3bSpd 
57373a98491Sdv 	for (i = 0; i < vm->vm_params.vmc_params.vcp_nmemranges; i++) {
57473a98491Sdv 		vmr = &vm->vm_params.vmc_params.vcp_memranges[i];
575eed20f3bSpd 		ret = dump_vmr(fd, vmr);
576eed20f3bSpd 		if (ret)
577eed20f3bSpd 			return ret;
578eed20f3bSpd 	}
579eed20f3bSpd 	return (0);
580eed20f3bSpd }
581eed20f3bSpd 
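/*
 * restore_vm_params
 *
 * Reads one vm_rwvmparams_params record per vcpu from fd and installs
 * it in vmm(4) via the VMM_IOC_WRITEVMPARAMS ioctl.
 *
 * Return values:
 *  0: success
 *  -1: read or ioctl failure
 */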
582cbd2a590Spd int
583cbd2a590Spd restore_vm_params(int fd, struct vm_create_params *vcp)
583cbd2a590Spd {
584cbd2a590Spd 	unsigned int			i;
585cbd2a590Spd 	struct vm_rwvmparams_params    vpp;
586cbd2a590Spd 
587cbd2a590Spd 	for (i = 0; i < vcp->vcp_ncpus; i++) {
588cbd2a590Spd 		if (atomicio(read, fd, &vpp, sizeof(vpp)) != sizeof(vpp)) {
589cbd2a590Spd 			log_warn("%s: error restoring vm params", __func__);
590cbd2a590Spd 			return (-1);
591cbd2a590Spd 		}
592cbd2a590Spd 		vpp.vpp_vm_id = vcp->vcp_id;
593cbd2a590Spd 		vpp.vpp_vcpu_id = i;
594cbd2a590Spd 		if (ioctl(env->vmd_fd, VMM_IOC_WRITEVMPARAMS, &vpp) < 0) {
595cbd2a590Spd 			log_debug("%s: writing vm params failed", __func__);
596cbd2a590Spd 			return (-1);
597cbd2a590Spd 		}
598cbd2a590Spd 	}
599cbd2a590Spd 	return (0);
600cbd2a590Spd }
601cbd2a590Spd 
602eed20f3bSpd void
603eed20f3bSpd restore_mem(int fd, struct vm_create_params *vcp)
604eed20f3bSpd {
605eed20f3bSpd 	unsigned int	     i;
606eed20f3bSpd 	struct vm_mem_range *vmr;
607eed20f3bSpd 
608eed20f3bSpd 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
609eed20f3bSpd 		vmr = &vcp->vcp_memranges[i];
610eed20f3bSpd 		restore_vmr(fd, vmr);
611eed20f3bSpd 	}
612eed20f3bSpd }
613eed20f3bSpd 
614eed20f3bSpd int
615eed20f3bSpd dump_vmr(int fd, struct vm_mem_range *vmr)
616eed20f3bSpd {
617eed20f3bSpd 	size_t	rem = vmr->vmr_size, read = 0;
618eed20f3bSpd 	char	buf[PAGE_SIZE];
619eed20f3bSpd 
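	/*
	 * Guest memory ranges are assumed to be multiples of PAGE_SIZE,
	 * so the loop below copies whole pages until the range is done.
	 */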
620eed20f3bSpd 	while (rem > 0) {
621eed20f3bSpd 		if (read_mem(vmr->vmr_gpa + read, buf, PAGE_SIZE)) {
622eed20f3bSpd 			log_warn("failed to read vmr");
623eed20f3bSpd 			return (-1);
624eed20f3bSpd 		}
625eed20f3bSpd 		if (atomicio(vwrite, fd, buf, sizeof(buf)) != sizeof(buf)) {
626eed20f3bSpd 			log_warn("failed to dump vmr");
627eed20f3bSpd 			return (-1);
628eed20f3bSpd 		}
629eed20f3bSpd 		rem = rem - PAGE_SIZE;
630eed20f3bSpd 		read = read + PAGE_SIZE;
631eed20f3bSpd 	}
632eed20f3bSpd 	return (0);
633eed20f3bSpd }
634eed20f3bSpd 
635eed20f3bSpd void
636eed20f3bSpd restore_vmr(int fd, struct vm_mem_range *vmr)
637eed20f3bSpd {
638eed20f3bSpd 	size_t	rem = vmr->vmr_size, wrote = 0;
639eed20f3bSpd 	char	buf[PAGE_SIZE];
640eed20f3bSpd 
641eed20f3bSpd 	while (rem > 0) {
642eed20f3bSpd 		if (atomicio(read, fd, buf, sizeof(buf)) != sizeof(buf))
643eed20f3bSpd 			fatal("failed to restore vmr");
644eed20f3bSpd 		if (write_mem(vmr->vmr_gpa + wrote, buf, PAGE_SIZE))
645eed20f3bSpd 			fatal("failed to write vmr");
646eed20f3bSpd 		rem = rem - PAGE_SIZE;
647eed20f3bSpd 		wrote = wrote + PAGE_SIZE;
648eed20f3bSpd 	}
649eed20f3bSpd }
650eed20f3bSpd 
65173a98491Sdv static void
65273a98491Sdv pause_vm(struct vmd_vm *vm)
65352e954a3Spd {
654548054a9Spd 	unsigned int n;
655548054a9Spd 	int ret;
65652e954a3Spd 
6575195cf3eSdv 	mutex_lock(&vm_mtx);
6585195cf3eSdv 	if (vm->vm_state & VM_STATE_PAUSED) {
6595195cf3eSdv 		mutex_unlock(&vm_mtx);
6605195cf3eSdv 		return;
6615195cf3eSdv 	}
66219700f36Sjasper 	current_vm->vm_state |= VM_STATE_PAUSED;
6635195cf3eSdv 	mutex_unlock(&vm_mtx);
66452e954a3Spd 
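	/*
	 * The pause barrier is sized for one thread per vcpu plus this
	 * controlling thread, hence vcp_ncpus + 1.
	 */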
66573a98491Sdv 	ret = pthread_barrier_init(&vm_pause_barrier, NULL,
66673a98491Sdv 	    vm->vm_params.vmc_params.vcp_ncpus + 1);
667548054a9Spd 	if (ret) {
668ad9e848cSpd 		log_warnx("%s: cannot initialize pause barrier (%d)",
669ad9e848cSpd 		    __progname, ret);
670548054a9Spd 		return;
671548054a9Spd 	}
672548054a9Spd 
67373a98491Sdv 	for (n = 0; n < vm->vm_params.vmc_params.vcp_ncpus; n++) {
674548054a9Spd 		ret = pthread_cond_broadcast(&vcpu_run_cond[n]);
675548054a9Spd 		if (ret) {
676548054a9Spd 			log_warnx("%s: can't broadcast vcpu run cond (%d)",
677548054a9Spd 			    __func__, (int)ret);
678548054a9Spd 			return;
679548054a9Spd 		}
680ad9e848cSpd 	}
681ad9e848cSpd 	ret = pthread_barrier_wait(&vm_pause_barrier);
682ad9e848cSpd 	if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) {
683ad9e848cSpd 		log_warnx("%s: could not wait on pause barrier (%d)",
684ad9e848cSpd 		    __func__, (int)ret);
685ad9e848cSpd 		return;
686ad9e848cSpd 	}
687548054a9Spd 
688ad9e848cSpd 	ret = pthread_barrier_destroy(&vm_pause_barrier);
689548054a9Spd 	if (ret) {
690ad9e848cSpd 		log_warnx("%s: could not destroy pause barrier (%d)",
691ad9e848cSpd 		    __progname, ret);
692548054a9Spd 		return;
693548054a9Spd 	}
69452e954a3Spd 
695c4fd4c5bSdv 	pause_vm_md(vm);
69652e954a3Spd }
69752e954a3Spd 
69873a98491Sdv static void
69973a98491Sdv unpause_vm(struct vmd_vm *vm)
70052e954a3Spd {
70152e954a3Spd 	unsigned int n;
702548054a9Spd 	int ret;
70352e954a3Spd 
7045195cf3eSdv 	mutex_lock(&vm_mtx);
7055195cf3eSdv 	if (!(vm->vm_state & VM_STATE_PAUSED)) {
7065195cf3eSdv 		mutex_unlock(&vm_mtx);
7075195cf3eSdv 		return;
7085195cf3eSdv 	}
70919700f36Sjasper 	current_vm->vm_state &= ~VM_STATE_PAUSED;
7105195cf3eSdv 	mutex_unlock(&vm_mtx);
7115195cf3eSdv 
71273a98491Sdv 	for (n = 0; n < vm->vm_params.vmc_params.vcp_ncpus; n++) {
713548054a9Spd 		ret = pthread_cond_broadcast(&vcpu_unpause_cond[n]);
714548054a9Spd 		if (ret) {
715548054a9Spd 			log_warnx("%s: can't broadcast vcpu unpause cond (%d)",
716548054a9Spd 			    __func__, (int)ret);
717548054a9Spd 			return;
718548054a9Spd 		}
719548054a9Spd 	}
72052e954a3Spd 
721c4fd4c5bSdv 	unpause_vm_md(vm);
72252e954a3Spd }
72352e954a3Spd 
7243320a88dSreyk /*
7251e1977eeSreyk  * vcpu_reset
7261e1977eeSreyk  *
7271e1977eeSreyk  * Requests vmm(4) to reset the VCPUs in the indicated VM to
7281e1977eeSreyk  * the register state provided
7291e1977eeSreyk  *
7301e1977eeSreyk  * Parameters
7311e1977eeSreyk  *  vmid: VM ID to reset
7321e1977eeSreyk  *  vcpu_id: VCPU ID to reset
7331e1977eeSreyk  *  vrs: the register state to initialize
7341e1977eeSreyk  *
7351e1977eeSreyk  * Return values:
7361e1977eeSreyk  *  0: success
7371e1977eeSreyk  *  !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM ID is not
7381e1977eeSreyk  *      valid)
7391e1977eeSreyk  */
7401e1977eeSreyk int
7411e1977eeSreyk vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs)
7421e1977eeSreyk {
7431e1977eeSreyk 	struct vm_resetcpu_params vrp;
7441e1977eeSreyk 
7451e1977eeSreyk 	memset(&vrp, 0, sizeof(vrp));
7461e1977eeSreyk 	vrp.vrp_vm_id = vmid;
7471e1977eeSreyk 	vrp.vrp_vcpu_id = vcpu_id;
7481e1977eeSreyk 	memcpy(&vrp.vrp_init_state, vrs, sizeof(struct vcpu_reg_state));
7491e1977eeSreyk 
7501e1977eeSreyk 	log_debug("%s: resetting vcpu %d for vm %d", __func__, vcpu_id, vmid);
7511e1977eeSreyk 
752df69c215Sderaadt 	if (ioctl(env->vmd_fd, VMM_IOC_RESETCPU, &vrp) == -1)
7531e1977eeSreyk 		return (errno);
7541e1977eeSreyk 
7551e1977eeSreyk 	return (0);
7561e1977eeSreyk }
7571e1977eeSreyk 
7581e1977eeSreyk /*
7591e1977eeSreyk  * alloc_guest_mem
7601e1977eeSreyk  *
7611e1977eeSreyk  * Allocates memory for the guest.
7621e1977eeSreyk  * Instead of doing a single allocation with one mmap(), we allocate memory
7631e1977eeSreyk  * separately for every range for the following reasons:
7641e1977eeSreyk  * - ASLR for the individual ranges
7651e1977eeSreyk  * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to
7661e1977eeSreyk  *   map the single mmap'd userspace memory to the individual guest physical
7671e1977eeSreyk  *   memory ranges, the underlying amap of the single mmap'd range would have
7681e1977eeSreyk  *   to allocate per-page reference counters. The reason is that the
7691e1977eeSreyk  *   individual guest physical ranges would reference the single mmap'd region
7701e1977eeSreyk  *   only partially. However, if every guest physical range has its own
7711e1977eeSreyk  *   corresponding mmap'd userspace allocation, there are no partial
7721e1977eeSreyk  *   references: every guest physical range fully references an mmap'd
7731e1977eeSreyk  *   range => no per-page reference counters have to be allocated.
7741e1977eeSreyk  *
7751e1977eeSreyk  * Return values:
7761e1977eeSreyk  *  0: success
7771e1977eeSreyk  *  !0: failure - errno indicating the source of the failure
7781e1977eeSreyk  */
7791e1977eeSreyk int
7803481ecdfSdv alloc_guest_mem(struct vmd_vm *vm)
7811e1977eeSreyk {
7821e1977eeSreyk 	void *p;
7833c817da7Sdv 	int ret = 0;
7841e1977eeSreyk 	size_t i, j;
7853481ecdfSdv 	struct vm_create_params *vcp = &vm->vm_params.vmc_params;
7861e1977eeSreyk 	struct vm_mem_range *vmr;
7871e1977eeSreyk 
7881e1977eeSreyk 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
7891e1977eeSreyk 		vmr = &vcp->vcp_memranges[i];
7903481ecdfSdv 
7913c817da7Sdv 		/*
7923c817da7Sdv 		 * We only need R/W as userland. vmm(4) will use R/W/X in its
7933c817da7Sdv 		 * mapping.
7943c817da7Sdv 		 *
7953c817da7Sdv 		 * We must use MAP_SHARED so emulated devices will be able
7963c817da7Sdv 		 * to generate shared mappings.
7973c817da7Sdv 		 */
7981e1977eeSreyk 		p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
7993c817da7Sdv 		    MAP_ANON | MAP_CONCEAL | MAP_SHARED, -1, 0);
8001e1977eeSreyk 		if (p == MAP_FAILED) {
8011e1977eeSreyk 			ret = errno;
8021e1977eeSreyk 			for (j = 0; j < i; j++) {
8031e1977eeSreyk 				vmr = &vcp->vcp_memranges[j];
8041e1977eeSreyk 				munmap((void *)vmr->vmr_va, vmr->vmr_size);
8051e1977eeSreyk 			}
8063c817da7Sdv 			return (ret);
8071e1977eeSreyk 		}
8081e1977eeSreyk 		vmr->vmr_va = (vaddr_t)p;
8091e1977eeSreyk 	}
8103c817da7Sdv 
8113481ecdfSdv 	return (ret);
8121e1977eeSreyk }
8131e1977eeSreyk 
8141e1977eeSreyk /*
8151e1977eeSreyk  * vmm_create_vm
8161e1977eeSreyk  *
8171e1977eeSreyk  * Requests vmm(4) to create a new VM using the supplied creation
8181e1977eeSreyk  * parameters. This operation results in the creation of the in-kernel
8191e1977eeSreyk  * structures for the VM, but does not start the VM's vcpu(s).
8201e1977eeSreyk  *
8211e1977eeSreyk  * Parameters:
82273a98491Sdv  *  vm: pointer to the vm object
8231e1977eeSreyk  *
8241e1977eeSreyk  * Return values:
8251e1977eeSreyk  *  0: success
8261e1977eeSreyk  *  !0 : ioctl to vmm(4) failed
8271e1977eeSreyk  */
82873a98491Sdv static int
82973a98491Sdv vmm_create_vm(struct vmd_vm *vm)
8301e1977eeSreyk {
83173a98491Sdv 	struct vm_create_params *vcp = &vm->vm_params.vmc_params;
832f4b47ae8Sbluhm 	size_t i;
83373a98491Sdv 
8341e1977eeSreyk 	/* Sanity check arguments */
8351e1977eeSreyk 	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
8361e1977eeSreyk 		return (EINVAL);
8371e1977eeSreyk 
8381e1977eeSreyk 	if (vcp->vcp_nmemranges == 0 ||
8391e1977eeSreyk 	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
8401e1977eeSreyk 		return (EINVAL);
8411e1977eeSreyk 
84273a98491Sdv 	if (vm->vm_params.vmc_ndisks > VM_MAX_DISKS_PER_VM)
8431e1977eeSreyk 		return (EINVAL);
8441e1977eeSreyk 
84573a98491Sdv 	if (vm->vm_params.vmc_nnics > VM_MAX_NICS_PER_VM)
8461e1977eeSreyk 		return (EINVAL);
8471e1977eeSreyk 
848df69c215Sderaadt 	if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) == -1)
8491e1977eeSreyk 		return (errno);
8501e1977eeSreyk 
851f4b47ae8Sbluhm 	for (i = 0; i < vcp->vcp_ncpus; i++)
852f4b47ae8Sbluhm 		vm->vm_sev_asid[i] = vcp->vcp_asid[i];
853f4b47ae8Sbluhm 
8541e1977eeSreyk 	return (0);
8551e1977eeSreyk }
8561e1977eeSreyk 
8571e1977eeSreyk 
8581e1977eeSreyk /*
8591e1977eeSreyk  * run_vm
8601e1977eeSreyk  *
8611e1977eeSreyk  * Runs the VM whose creation parameters are specified in vcp
8621e1977eeSreyk  *
8631e1977eeSreyk  * Parameters:
8672b2a5f0dSreyk  *  vmc: vmop_create_params struct containing the VM's desired creation
8681e1977eeSreyk  *      configuration
8691e1977eeSreyk  *  vrs: VCPU register state to initialize
8701e1977eeSreyk  *
8711e1977eeSreyk  * Return values:
8721e1977eeSreyk  *  0: the VM exited normally
8731e1977eeSreyk  *  !0 : the VM exited abnormally or failed to start
8741e1977eeSreyk  */
87524386e31Sdv static int
87624386e31Sdv run_vm(struct vmop_create_params *vmc, struct vcpu_reg_state *vrs)
8771e1977eeSreyk {
8782b2a5f0dSreyk 	struct vm_create_params *vcp = &vmc->vmc_params;
879eed20f3bSpd 	struct vm_rwregs_params vregsp;
8801e1977eeSreyk 	uint8_t evdone = 0;
8811e1977eeSreyk 	size_t i;
8821e1977eeSreyk 	int ret;
8831e1977eeSreyk 	pthread_t *tid, evtid;
884ad00e8c1Sdv 	char tname[MAXCOMLEN + 1];
8851e1977eeSreyk 	struct vm_run_params **vrp;
8861e1977eeSreyk 	void *exit_status;
8871e1977eeSreyk 
8881e1977eeSreyk 	if (vcp == NULL)
8891e1977eeSreyk 		return (EINVAL);
8901e1977eeSreyk 
8911e1977eeSreyk 	if (vcp->vcp_nmemranges == 0 ||
8921e1977eeSreyk 	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
8931e1977eeSreyk 		return (EINVAL);
8941e1977eeSreyk 
8951e1977eeSreyk 	tid = calloc(vcp->vcp_ncpus, sizeof(pthread_t));
8961e1977eeSreyk 	vrp = calloc(vcp->vcp_ncpus, sizeof(struct vm_run_params *));
8971e1977eeSreyk 	if (tid == NULL || vrp == NULL) {
8981e1977eeSreyk 		log_warn("%s: memory allocation error - exiting.",
8991e1977eeSreyk 		    __progname);
9001e1977eeSreyk 		return (ENOMEM);
9011e1977eeSreyk 	}
9021e1977eeSreyk 
90324386e31Sdv 	log_debug("%s: starting %zu vcpu thread(s) for vm %s", __func__,
90424386e31Sdv 	    vcp->vcp_ncpus, vcp->vcp_name);
9051e1977eeSreyk 
9061e1977eeSreyk 	/*
9071e1977eeSreyk 	 * Create and launch one thread for each VCPU. These threads may
9081e1977eeSreyk 	 * migrate between PCPUs over time; the need to reload CPU state
9091e1977eeSreyk 	 * in such situations is detected and performed by vmm(4) in the
9101e1977eeSreyk 	 * kernel.
9111e1977eeSreyk 	 */
9121e1977eeSreyk 	for (i = 0 ; i < vcp->vcp_ncpus; i++) {
9131e1977eeSreyk 		vrp[i] = malloc(sizeof(struct vm_run_params));
9141e1977eeSreyk 		if (vrp[i] == NULL) {
9151e1977eeSreyk 			log_warn("%s: memory allocation error - "
9161e1977eeSreyk 			    "exiting.", __progname);
9176ad82823Smlarkin 			/* caller will exit, so skip freeing */
9181e1977eeSreyk 			return (ENOMEM);
9191e1977eeSreyk 		}
92002ee787fSmlarkin 		vrp[i]->vrp_exit = malloc(sizeof(struct vm_exit));
9211e1977eeSreyk 		if (vrp[i]->vrp_exit == NULL) {
9221e1977eeSreyk 			log_warn("%s: memory allocation error - "
9231e1977eeSreyk 			    "exiting.", __progname);
9246ad82823Smlarkin 			/* caller will exit, so skip freeing */
9251e1977eeSreyk 			return (ENOMEM);
9261e1977eeSreyk 		}
9271e1977eeSreyk 		vrp[i]->vrp_vm_id = vcp->vcp_id;
9281e1977eeSreyk 		vrp[i]->vrp_vcpu_id = i;
9291e1977eeSreyk 
9301e1977eeSreyk 		if (vcpu_reset(vcp->vcp_id, i, vrs)) {
9311e1977eeSreyk 			log_warnx("%s: cannot reset VCPU %zu - exiting.",
9321e1977eeSreyk 			    __progname, i);
9331e1977eeSreyk 			return (EIO);
9341e1977eeSreyk 		}
9351e1977eeSreyk 
936f4b47ae8Sbluhm 		if (sev_activate(current_vm, i)) {
937f4b47ae8Sbluhm 			log_warnx("%s: SEV activation failed for VCPU "
938f4b47ae8Sbluhm 			    "%zu - exiting.", __progname, i);
939f4b47ae8Sbluhm 			return (EIO);
940f4b47ae8Sbluhm 		}
941f4b47ae8Sbluhm 
942f4b47ae8Sbluhm 		if (sev_encrypt_memory(current_vm)) {
943f4b47ae8Sbluhm 			log_warnx("%s: memory encryption failed for VCPU "
944f4b47ae8Sbluhm 			    "%zu - exiting.", __progname, i);
945f4b47ae8Sbluhm 			return (EIO);
946f4b47ae8Sbluhm 		}
947f4b47ae8Sbluhm 
9486ad82823Smlarkin 		/* once more because reset_cpu changes regs */
94919700f36Sjasper 		if (current_vm->vm_state & VM_STATE_RECEIVED) {
950eed20f3bSpd 			vregsp.vrwp_vm_id = vcp->vcp_id;
951eed20f3bSpd 			vregsp.vrwp_vcpu_id = i;
952eed20f3bSpd 			vregsp.vrwp_regs = *vrs;
953eed20f3bSpd 			vregsp.vrwp_mask = VM_RWREGS_ALL;
954eed20f3bSpd 			if ((ret = ioctl(env->vmd_fd, VMM_IOC_WRITEREGS,
955df69c215Sderaadt 			    &vregsp)) == -1) {
956eed20f3bSpd 				log_warn("%s: writeregs failed", __func__);
957eed20f3bSpd 				return (ret);
958eed20f3bSpd 			}
959eed20f3bSpd 		}
960eed20f3bSpd 
9611e1977eeSreyk 		ret = pthread_cond_init(&vcpu_run_cond[i], NULL);
9621e1977eeSreyk 		if (ret) {
9631e1977eeSreyk 			log_warnx("%s: cannot initialize cond var (%d)",
9641e1977eeSreyk 			    __progname, ret);
9651e1977eeSreyk 			return (ret);
9661e1977eeSreyk 		}
9671e1977eeSreyk 
9681e1977eeSreyk 		ret = pthread_mutex_init(&vcpu_run_mtx[i], NULL);
9691e1977eeSreyk 		if (ret) {
9701e1977eeSreyk 			log_warnx("%s: cannot initialize mtx (%d)",
9711e1977eeSreyk 			    __progname, ret);
9721e1977eeSreyk 			return (ret);
9731e1977eeSreyk 		}
974548054a9Spd 
975548054a9Spd 		ret = pthread_cond_init(&vcpu_unpause_cond[i], NULL);
976548054a9Spd 		if (ret) {
977548054a9Spd 			log_warnx("%s: cannot initialize unpause var (%d)",
978548054a9Spd 			    __progname, ret);
979548054a9Spd 			return (ret);
980548054a9Spd 		}
981548054a9Spd 
982548054a9Spd 		ret = pthread_mutex_init(&vcpu_unpause_mtx[i], NULL);
983548054a9Spd 		if (ret) {
984548054a9Spd 			log_warnx("%s: cannot initialize unpause mtx (%d)",
985548054a9Spd 			    __progname, ret);
986548054a9Spd 			return (ret);
987548054a9Spd 		}
9881e1977eeSreyk 
9891e1977eeSreyk 		vcpu_hlt[i] = 0;
9901e1977eeSreyk 
9911e1977eeSreyk 		/* Start each VCPU run thread at vcpu_run_loop */
9921e1977eeSreyk 		ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]);
9931e1977eeSreyk 		if (ret) {
9941e1977eeSreyk 			/* caller will _exit after this return */
9951e1977eeSreyk 			ret = errno;
9961e1977eeSreyk 			log_warn("%s: could not create vcpu thread %zu",
9971e1977eeSreyk 			    __func__, i);
9981e1977eeSreyk 			return (ret);
9991e1977eeSreyk 		}
1000ad00e8c1Sdv 
1001ad00e8c1Sdv 		snprintf(tname, sizeof(tname), "vcpu-%zu", i);
1002ad00e8c1Sdv 		pthread_set_name_np(tid[i], tname);
10031e1977eeSreyk 	}
10041e1977eeSreyk 
10051e1977eeSreyk 	log_debug("%s: waiting on events for VM %s", __func__, vcp->vcp_name);
10061e1977eeSreyk 	ret = pthread_create(&evtid, NULL, event_thread, &evdone);
10071e1977eeSreyk 	if (ret) {
10081e1977eeSreyk 		errno = ret;
10091e1977eeSreyk 		log_warn("%s: could not create event thread", __func__);
10101e1977eeSreyk 		return (ret);
10111e1977eeSreyk 	}
1012ad00e8c1Sdv 	pthread_set_name_np(evtid, "event");
10131e1977eeSreyk 
10141e1977eeSreyk 	for (;;) {
10151e1977eeSreyk 		ret = pthread_cond_wait(&threadcond, &threadmutex);
10161e1977eeSreyk 		if (ret) {
10171e1977eeSreyk 			log_warn("%s: waiting on thread state condition "
10181e1977eeSreyk 			    "variable failed", __func__);
10191e1977eeSreyk 			return (ret);
10201e1977eeSreyk 		}
10211e1977eeSreyk 
10221e1977eeSreyk 		/*
10231e1977eeSreyk 		 * Did a VCPU thread exit with an error? => return the first one
10241e1977eeSreyk 		 */
10255195cf3eSdv 		mutex_lock(&vm_mtx);
10261e1977eeSreyk 		for (i = 0; i < vcp->vcp_ncpus; i++) {
10271e1977eeSreyk 			if (vcpu_done[i] == 0)
10281e1977eeSreyk 				continue;
10291e1977eeSreyk 
10301e1977eeSreyk 			if (pthread_join(tid[i], &exit_status)) {
10311e1977eeSreyk 				log_warn("%s: failed to join thread %zd - "
10321e1977eeSreyk 				    "exiting", __progname, i);
10335195cf3eSdv 				mutex_unlock(&vm_mtx);
10341e1977eeSreyk 				return (EIO);
10351e1977eeSreyk 			}
10361e1977eeSreyk 
10376a9ef12bSmlarkin 			ret = (intptr_t)exit_status;
10381e1977eeSreyk 		}
10395195cf3eSdv 		mutex_unlock(&vm_mtx);
10401e1977eeSreyk 
10411e1977eeSreyk 		/* Did the event thread exit? => return with an error */
10421e1977eeSreyk 		if (evdone) {
10431e1977eeSreyk 			if (pthread_join(evtid, &exit_status)) {
10441e1977eeSreyk 				log_warn("%s: failed to join event thread - "
10451e1977eeSreyk 				    "exiting", __progname);
10461e1977eeSreyk 				return (EIO);
10471e1977eeSreyk 			}
10481e1977eeSreyk 
10491e1977eeSreyk 			log_warnx("%s: vm %d event thread exited "
10501e1977eeSreyk 			    "unexpectedly", __progname, vcp->vcp_id);
10511e1977eeSreyk 			return (EIO);
10521e1977eeSreyk 		}
10531e1977eeSreyk 
10541e1977eeSreyk 		/* Did all VCPU threads exit successfully? => return */
10555195cf3eSdv 		mutex_lock(&vm_mtx);
10561e1977eeSreyk 		for (i = 0; i < vcp->vcp_ncpus; i++) {
10571e1977eeSreyk 			if (vcpu_done[i] == 0)
10581e1977eeSreyk 				break;
10591e1977eeSreyk 		}
10605195cf3eSdv 		mutex_unlock(&vm_mtx);
10611e1977eeSreyk 		if (i == vcp->vcp_ncpus)
10621e1977eeSreyk 			return (ret);
10631e1977eeSreyk 
10641e1977eeSreyk 		/* Some more threads to wait for, start over */
10651e1977eeSreyk 	}
10661e1977eeSreyk 
10671e1977eeSreyk 	return (ret);
10681e1977eeSreyk }
10691e1977eeSreyk 
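/*
 * event_thread
 *
 * Runs the libevent dispatch loop for this vm process. When the loop
 * exits, flags completion via the done pointer and signals the main
 * thread waiting on threadcond.
 */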
1070c4fd4c5bSdv static void *
10711e1977eeSreyk event_thread(void *arg)
10721e1977eeSreyk {
10731e1977eeSreyk 	uint8_t *donep = arg;
10741e1977eeSreyk 	intptr_t ret;
10751e1977eeSreyk 
10769d3767a2Smlarkin 	ret = event_dispatch();
10771e1977eeSreyk 
10781e1977eeSreyk 	*donep = 1;
10795195cf3eSdv 
10805195cf3eSdv 	mutex_lock(&threadmutex);
10811e1977eeSreyk 	pthread_cond_signal(&threadcond);
10821e1977eeSreyk 	mutex_unlock(&threadmutex);
10831e1977eeSreyk 
10841e1977eeSreyk 	return (void *)ret;
10851e1977eeSreyk }
10861e1977eeSreyk 
10871e1977eeSreyk /*
10881e1977eeSreyk  * vcpu_run_loop
10891e1977eeSreyk  *
10901e1977eeSreyk  * Runs a single VCPU until vmm(4) requires help handling an exit,
10911e1977eeSreyk  * or the VM terminates.
10921e1977eeSreyk  *
10931e1977eeSreyk  * Parameters:
10941e1977eeSreyk  *  arg: vcpu_run_params for the VCPU being run by this thread
10951e1977eeSreyk  *
10961e1977eeSreyk  * Return values:
10971e1977eeSreyk  *  NULL: the VCPU shutdown properly
10981e1977eeSreyk  *  !NULL: error processing VCPU run, or the VCPU shutdown abnormally
10991e1977eeSreyk  */
1100c4fd4c5bSdv static void *
11011e1977eeSreyk vcpu_run_loop(void *arg)
11021e1977eeSreyk {
11031e1977eeSreyk 	struct vm_run_params *vrp = (struct vm_run_params *)arg;
11041e1977eeSreyk 	intptr_t ret = 0;
11055195cf3eSdv 	uint32_t n = vrp->vrp_vcpu_id;
11065195cf3eSdv 	int paused = 0, halted = 0;
11071e1977eeSreyk 
11081e1977eeSreyk 	for (;;) {
11091e1977eeSreyk 		ret = pthread_mutex_lock(&vcpu_run_mtx[n]);
11101e1977eeSreyk 
11111e1977eeSreyk 		if (ret) {
11121e1977eeSreyk 			log_warnx("%s: can't lock vcpu run mtx (%d)",
11131e1977eeSreyk 			    __func__, (int)ret);
11141e1977eeSreyk 			return ((void *)ret);
11151e1977eeSreyk 		}
11161e1977eeSreyk 
11175195cf3eSdv 		mutex_lock(&vm_mtx);
11185195cf3eSdv 		paused = (current_vm->vm_state & VM_STATE_PAUSED) != 0;
11195195cf3eSdv 		halted = vcpu_hlt[n];
11205195cf3eSdv 		mutex_unlock(&vm_mtx);
11215195cf3eSdv 
1122548054a9Spd 		/* If we are halted and need to pause, pause */
11235195cf3eSdv 		if (halted && paused) {
1124ad9e848cSpd 			ret = pthread_barrier_wait(&vm_pause_barrier);
1125ad9e848cSpd 			if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) {
1126ad9e848cSpd 				log_warnx("%s: could not wait on pause barrier (%d)",
1127ad9e848cSpd 				    __func__, (int)ret);
1128548054a9Spd 				return ((void *)ret);
1129548054a9Spd 			}
1130548054a9Spd 
1131548054a9Spd 			ret = pthread_mutex_lock(&vcpu_unpause_mtx[n]);
1132548054a9Spd 			if (ret) {
1133548054a9Spd 				log_warnx("%s: can't lock vcpu unpause mtx (%d)",
1134548054a9Spd 				    __func__, (int)ret);
1135548054a9Spd 				return ((void *)ret);
1136548054a9Spd 			}
1137548054a9Spd 
1138c4fd4c5bSdv 			/* Interrupt may be firing, release run mtx. */
113971ab85deSdv 			mutex_unlock(&vcpu_run_mtx[n]);
1140548054a9Spd 			ret = pthread_cond_wait(&vcpu_unpause_cond[n],
1141548054a9Spd 			    &vcpu_unpause_mtx[n]);
114252e954a3Spd 			if (ret) {
114352e954a3Spd 				log_warnx(
1144548054a9Spd 				    "%s: can't wait on unpause cond (%d)",
114552e954a3Spd 				    __func__, (int)ret);
1146548054a9Spd 				break;
1147548054a9Spd 			}
114871ab85deSdv 			mutex_lock(&vcpu_run_mtx[n]);
114971ab85deSdv 
1150548054a9Spd 			ret = pthread_mutex_unlock(&vcpu_unpause_mtx[n]);
1151548054a9Spd 			if (ret) {
1152548054a9Spd 				log_warnx("%s: can't unlock unpause mtx (%d)",
1153548054a9Spd 				    __func__, (int)ret);
115452e954a3Spd 				break;
115552e954a3Spd 			}
115652e954a3Spd 		}
1157548054a9Spd 
1158548054a9Spd 		/* If we are halted and not paused, wait */
11595195cf3eSdv 		if (halted) {
11601e1977eeSreyk 			ret = pthread_cond_wait(&vcpu_run_cond[n],
11611e1977eeSreyk 			    &vcpu_run_mtx[n]);
11621e1977eeSreyk 
11631e1977eeSreyk 			if (ret) {
116452e954a3Spd 				log_warnx(
116552e954a3Spd 				    "%s: can't wait on cond (%d)",
11661e1977eeSreyk 				    __func__, (int)ret);
116752e954a3Spd 				(void)pthread_mutex_unlock(
116852e954a3Spd 				    &vcpu_run_mtx[n]);
11691e1977eeSreyk 				break;
11701e1977eeSreyk 			}
11711e1977eeSreyk 		}
11721e1977eeSreyk 
11731e1977eeSreyk 		ret = pthread_mutex_unlock(&vcpu_run_mtx[n]);
117452e954a3Spd 
11751e1977eeSreyk 		if (ret) {
11761e1977eeSreyk 			log_warnx("%s: can't unlock mutex on cond (%d)",
11771e1977eeSreyk 			    __func__, (int)ret);
11781e1977eeSreyk 			break;
11791e1977eeSreyk 		}
11801e1977eeSreyk 
1181c4fd4c5bSdv 		if (vrp->vrp_irqready && intr_pending(current_vm)) {
1182c4fd4c5bSdv 			vrp->vrp_inject.vie_vector = intr_ack(current_vm);
11831ebbcee8Sdv 			vrp->vrp_inject.vie_type = VCPU_INJECT_INTR;
11841e1977eeSreyk 		} else
11851ebbcee8Sdv 			vrp->vrp_inject.vie_type = VCPU_INJECT_NONE;
11861e1977eeSreyk 
1187524d607bSdv 		/* Still more interrupts pending? */
1188c4fd4c5bSdv 		vrp->vrp_intr_pending = intr_pending(current_vm);
11898390ed4cSdv 
1190df69c215Sderaadt 		if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) == -1) {
11911e1977eeSreyk 			/* If run ioctl failed, exit */
11921e1977eeSreyk 			ret = errno;
11931e1977eeSreyk 			log_warn("%s: vm %d / vcpu %d run ioctl failed",
1194e2bf67b2Sdv 			    __func__, current_vm->vm_vmid, n);
11951e1977eeSreyk 			break;
11961e1977eeSreyk 		}
11971e1977eeSreyk 
11981e1977eeSreyk 		/* If the VM is terminating, exit normally */
11991e1977eeSreyk 		if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED) {
12001e1977eeSreyk 			ret = (intptr_t)NULL;
12011e1977eeSreyk 			break;
12021e1977eeSreyk 		}
12031e1977eeSreyk 
12041e1977eeSreyk 		if (vrp->vrp_exit_reason != VM_EXIT_NONE) {
12051e1977eeSreyk 			/*
12061e1977eeSreyk 			 * vmm(4) needs help handling an exit, handle in
12071e1977eeSreyk 			 * vcpu_exit.
12081e1977eeSreyk 			 */
12091e1977eeSreyk 			ret = vcpu_exit(vrp);
12101e1977eeSreyk 			if (ret)
12111e1977eeSreyk 				break;
12121e1977eeSreyk 		}
12131e1977eeSreyk 	}
12141e1977eeSreyk 
12155195cf3eSdv 	mutex_lock(&vm_mtx);
12161e1977eeSreyk 	vcpu_done[n] = 1;
12175195cf3eSdv 	mutex_unlock(&vm_mtx);
12185195cf3eSdv 
12195195cf3eSdv 	mutex_lock(&threadmutex);
12201e1977eeSreyk 	pthread_cond_signal(&threadcond);
12211e1977eeSreyk 	mutex_unlock(&threadmutex);
12221e1977eeSreyk 
12231e1977eeSreyk 	return ((void *)ret);
12241e1977eeSreyk }
12251e1977eeSreyk 
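/*
 * vcpu_intr
 *
 * Requests vmm(4) to set the interrupt pending state of the given vcpu
 * via the VMM_IOC_INTR ioctl.
 *
 * Return values:
 *  0: success
 *  !0 : ioctl to vmm(4) failed
 */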
12261e1977eeSreyk int
1227c4fd4c5bSdv vcpu_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr)
12281e1977eeSreyk {
12291e1977eeSreyk 	struct vm_intr_params vip;
12301e1977eeSreyk 
12311e1977eeSreyk 	memset(&vip, 0, sizeof(vip));
12321e1977eeSreyk 
12331e1977eeSreyk 	vip.vip_vm_id = vm_id;
12341e1977eeSreyk 	vip.vip_vcpu_id = vcpu_id; /* XXX always 0? */
12351e1977eeSreyk 	vip.vip_intr = intr;
12361e1977eeSreyk 
1237df69c215Sderaadt 	if (ioctl(env->vmd_fd, VMM_IOC_INTR, &vip) == -1)
12381e1977eeSreyk 		return (errno);
12391e1977eeSreyk 
12401e1977eeSreyk 	return (0);
12411e1977eeSreyk }
12421e1977eeSreyk 
12431e1977eeSreyk /*
12441e1977eeSreyk  * fd_hasdata
12451e1977eeSreyk  *
12461e1977eeSreyk  * Determines if data can be read from a file descriptor.
12471e1977eeSreyk  *
12481e1977eeSreyk  * Parameters:
12491e1977eeSreyk  *  fd: the fd to check
12501e1977eeSreyk  *
12511e1977eeSreyk  * Return values:
12521e1977eeSreyk  *  1 if data can be read from an fd, or 0 otherwise.
12531e1977eeSreyk  */
12541e1977eeSreyk int
12551e1977eeSreyk fd_hasdata(int fd)
12561e1977eeSreyk {
12571e1977eeSreyk 	struct pollfd pfd[1];
12581e1977eeSreyk 	int nready, hasdata = 0;
12591e1977eeSreyk 
12601e1977eeSreyk 	pfd[0].fd = fd;
12611e1977eeSreyk 	pfd[0].events = POLLIN;
12621e1977eeSreyk 	nready = poll(pfd, 1, 0);
12631e1977eeSreyk 	if (nready == -1)
12641e1977eeSreyk 		log_warn("checking file descriptor for data failed");
12651e1977eeSreyk 	else if (nready == 1 && pfd[0].revents & POLLIN)
12661e1977eeSreyk 		hasdata = 1;
12671e1977eeSreyk 	return (hasdata);
12681e1977eeSreyk }
12691e1977eeSreyk 
12701e1977eeSreyk /*
12711e1977eeSreyk  * mutex_lock
12721e1977eeSreyk  *
12731e1977eeSreyk  * Wrapper function for pthread_mutex_lock that does error checking and that
12741e1977eeSreyk  * exits on failure
12751e1977eeSreyk  */
12761e1977eeSreyk void
12771e1977eeSreyk mutex_lock(pthread_mutex_t *m)
12781e1977eeSreyk {
12791e1977eeSreyk 	int ret;
12801e1977eeSreyk 
12811e1977eeSreyk 	ret = pthread_mutex_lock(m);
12821e1977eeSreyk 	if (ret) {
12831e1977eeSreyk 		errno = ret;
12841e1977eeSreyk 		fatal("could not acquire mutex");
12851e1977eeSreyk 	}
12861e1977eeSreyk }
12871e1977eeSreyk 
12881e1977eeSreyk /*
12891e1977eeSreyk  * mutex_unlock
12901e1977eeSreyk  *
12911e1977eeSreyk  * Wrapper function for pthread_mutex_unlock that does error checking and that
12921e1977eeSreyk  * exits on failure
12931e1977eeSreyk  */
12941e1977eeSreyk void
12951e1977eeSreyk mutex_unlock(pthread_mutex_t *m)
12961e1977eeSreyk {
12971e1977eeSreyk 	int ret;
12981e1977eeSreyk 
12991e1977eeSreyk 	ret = pthread_mutex_unlock(m);
13001e1977eeSreyk 	if (ret) {
13011e1977eeSreyk 		errno = ret;
13021e1977eeSreyk 		fatal("could not release mutex");
13031e1977eeSreyk 	}
13041e1977eeSreyk }
1305ffc3523bSmlarkin 
130608fd0ce3Spd 
1307a246f7a0Sdv void
1308a246f7a0Sdv vm_pipe_init(struct vm_dev_pipe *p, void (*cb)(int, short, void *))
1309a246f7a0Sdv {
1310a246f7a0Sdv 	vm_pipe_init2(p, cb, NULL);
1311a246f7a0Sdv }
1312a246f7a0Sdv 
131308fd0ce3Spd /*
1314a246f7a0Sdv  * vm_pipe_init2
131508fd0ce3Spd  *
131608fd0ce3Spd  * Initialize a vm_dev_pipe, setting up its file descriptors and its
1317a246f7a0Sdv  * event structure with the given callback and argument.
131808fd0ce3Spd  *
131908fd0ce3Spd  * Parameters:
132008fd0ce3Spd  *  p: pointer to vm_dev_pipe struct to initizlize
132008fd0ce3Spd  *  p: pointer to vm_dev_pipe struct to initialize
1322a246f7a0Sdv  *  arg: pointer to pass to the callback on event trigger
132308fd0ce3Spd  */
132408fd0ce3Spd void
1325a246f7a0Sdv vm_pipe_init2(struct vm_dev_pipe *p, void (*cb)(int, short, void *), void *arg)
132608fd0ce3Spd {
132708fd0ce3Spd 	int ret;
132808fd0ce3Spd 	int fds[2];
132908fd0ce3Spd 
133008fd0ce3Spd 	memset(p, 0, sizeof(struct vm_dev_pipe));
133108fd0ce3Spd 
1332b3bc6112Sdv 	ret = pipe2(fds, O_CLOEXEC);
133308fd0ce3Spd 	if (ret)
133408fd0ce3Spd 		fatal("failed to create vm_dev_pipe pipe");
133508fd0ce3Spd 
133608fd0ce3Spd 	p->read = fds[0];
133708fd0ce3Spd 	p->write = fds[1];
133808fd0ce3Spd 
1339a246f7a0Sdv 	event_set(&p->read_ev, p->read, EV_READ | EV_PERSIST, cb, arg);
134008fd0ce3Spd }
134108fd0ce3Spd 
134208fd0ce3Spd /*
134308fd0ce3Spd  * vm_pipe_send
134408fd0ce3Spd  *
1345a246f7a0Sdv  * Send a message to an emulated device via the provided vm_dev_pipe. This
1346a246f7a0Sdv  * relies on the fact that sizeof(msg) < PIPE_BUF to ensure atomic writes.
134708fd0ce3Spd  *
134808fd0ce3Spd  * Parameters:
134908fd0ce3Spd  *  p: pointer to initialized vm_dev_pipe
135008fd0ce3Spd  *  msg: message to send in the channel
135108fd0ce3Spd  */
135208fd0ce3Spd void
135308fd0ce3Spd vm_pipe_send(struct vm_dev_pipe *p, enum pipe_msg_type msg)
135408fd0ce3Spd {
135508fd0ce3Spd 	size_t n;
135608fd0ce3Spd 	n = write(p->write, &msg, sizeof(msg));
135708fd0ce3Spd 	if (n != sizeof(msg))
135808fd0ce3Spd 		fatal("failed to write to device pipe");
135908fd0ce3Spd }
136008fd0ce3Spd 
136108fd0ce3Spd /*
136208fd0ce3Spd  * vm_pipe_recv
136308fd0ce3Spd  *
136408fd0ce3Spd  * Receive a message for an emulated device via the provided vm_dev_pipe.
1365a246f7a0Sdv  * Returns the message value, or exits fatally on failure. This relies on
1366a246f7a0Sdv  * the fact that sizeof(enum pipe_msg_type) < PIPE_BUF for atomic reads.
136708fd0ce3Spd  *
136808fd0ce3Spd  * Parameters:
136908fd0ce3Spd  *  p: pointer to initialized vm_dev_pipe
137008fd0ce3Spd  *
137108fd0ce3Spd  * Return values:
137208fd0ce3Spd  *  a value of enum pipe_msg_type or fatal exit on read(2) error
137308fd0ce3Spd  */
137408fd0ce3Spd enum pipe_msg_type
137508fd0ce3Spd vm_pipe_recv(struct vm_dev_pipe *p)
137608fd0ce3Spd {
137708fd0ce3Spd 	size_t n;
137808fd0ce3Spd 	enum pipe_msg_type msg;
137908fd0ce3Spd 	n = read(p->read, &msg, sizeof(msg));
138008fd0ce3Spd 	if (n != sizeof(msg))
138108fd0ce3Spd 		fatal("failed to read from device pipe");
138208fd0ce3Spd 
138308fd0ce3Spd 	return msg;
138408fd0ce3Spd }
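
/*
 * A minimal usage sketch for the vm_dev_pipe helpers (hypothetical
 * device code; dev_pipe_cb and MSG_WAKEUP are illustrative names, not
 * part of vmd):
 *
 *	static void
 *	dev_pipe_cb(int fd, short event, void *arg)
 *	{
 *		enum pipe_msg_type msg = vm_pipe_recv(arg);
 *		... handle msg ...
 *	}
 *
 *	struct vm_dev_pipe p;
 *	vm_pipe_init2(&p, dev_pipe_cb, &p);
 *	event_add(&p.read_ev, NULL);
 *	vm_pipe_send(&p, MSG_WAKEUP);	(from another thread)
 */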
13853481ecdfSdv 
13863481ecdfSdv /*
1387b3bc6112Sdv  * Re-map the guest address space using vmm(4)'s VMM_IOC_SHAREMEM ioctl.
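 *
 * The sequence is: reserve non-overlapping virtual address ranges with
 * mmap(2), release them with munmap(2), then ask vmm(4) to enter shared
 * mappings of guest memory at those same addresses.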
13883481ecdfSdv  *
13893481ecdfSdv  * Returns 0 on success, non-zero in event of failure.
13903481ecdfSdv  */
13913481ecdfSdv int
13923c817da7Sdv remap_guest_mem(struct vmd_vm *vm, int vmm_fd)
13933481ecdfSdv {
13943481ecdfSdv 	struct vm_create_params	*vcp;
13953481ecdfSdv 	struct vm_mem_range	*vmr;
13963c817da7Sdv 	struct vm_sharemem_params vsp;
13973481ecdfSdv 	size_t			 i, j;
13983481ecdfSdv 	void			*p = NULL;
13993481ecdfSdv 	int			 ret;
14003481ecdfSdv 
14013481ecdfSdv 	if (vm == NULL)
14023481ecdfSdv 		return (1);
14033481ecdfSdv 
14043481ecdfSdv 	vcp = &vm->vm_params.vmc_params;
14053481ecdfSdv 
14063481ecdfSdv 	/*
14073c817da7Sdv 	 * Initialize our VM shared memory request using our original
14083c817da7Sdv 	 * creation parameters. We'll overwrite the va's after mmap(2).
14093c817da7Sdv 	 */
14103c817da7Sdv 	memset(&vsp, 0, sizeof(vsp));
14113c817da7Sdv 	vsp.vsp_nmemranges = vcp->vcp_nmemranges;
14123c817da7Sdv 	vsp.vsp_vm_id = vcp->vcp_id;
14133c817da7Sdv 	memcpy(&vsp.vsp_memranges, &vcp->vcp_memranges,
14143c817da7Sdv 	    sizeof(vsp.vsp_memranges));
14153c817da7Sdv 
14163c817da7Sdv 	/*
14173c817da7Sdv 	 * Use mmap(2) to identify virtual address space for our mappings.
14183481ecdfSdv 	 */
14193481ecdfSdv 	for (i = 0; i < VMM_MAX_MEM_RANGES; i++) {
14203c817da7Sdv 		if (i < vsp.vsp_nmemranges) {
14213c817da7Sdv 			vmr = &vsp.vsp_memranges[i];
14223c817da7Sdv 
14233c817da7Sdv 			/* Ignore any MMIO ranges. */
14243481ecdfSdv 			if (vmr->vmr_type == VM_MEM_MMIO) {
14253c817da7Sdv 				vmr->vmr_va = 0;
14263c817da7Sdv 				vcp->vcp_memranges[i].vmr_va = 0;
14273481ecdfSdv 				continue;
14283481ecdfSdv 			}
14293c817da7Sdv 
14303c817da7Sdv 			/* Make initial mappings for the memrange. */
14313c817da7Sdv 			p = mmap(NULL, vmr->vmr_size, PROT_READ, MAP_ANON, -1,
14323c817da7Sdv 			    0);
14333481ecdfSdv 			if (p == MAP_FAILED) {
14343481ecdfSdv 				ret = errno;
14353481ecdfSdv 				log_warn("%s: mmap", __func__);
14363481ecdfSdv 				for (j = 0; j < i; j++) {
14373481ecdfSdv 					vmr = &vcp->vcp_memranges[j];
14383481ecdfSdv 					munmap((void *)vmr->vmr_va,
14393481ecdfSdv 					    vmr->vmr_size);
14403481ecdfSdv 				}
14413481ecdfSdv 				return (ret);
14423481ecdfSdv 			}
14433481ecdfSdv 			vmr->vmr_va = (vaddr_t)p;
14443c817da7Sdv 			vcp->vcp_memranges[i].vmr_va = vmr->vmr_va;
14453481ecdfSdv 		}
14463481ecdfSdv 	}
14473481ecdfSdv 
14483c817da7Sdv 	/*
14493c817da7Sdv 	 * munmap(2) now that we have va's and ranges that don't overlap. vmm
14503c817da7Sdv 	 * will use the va's and sizes to recreate the mappings for us.
14513c817da7Sdv 	 */
14523c817da7Sdv 	for (i = 0; i < vsp.vsp_nmemranges; i++) {
14533c817da7Sdv 		vmr = &vsp.vsp_memranges[i];
14543c817da7Sdv 		if (vmr->vmr_type == VM_MEM_MMIO)
14553c817da7Sdv 			continue;
14563c817da7Sdv 		if (munmap((void *)vmr->vmr_va, vmr->vmr_size) == -1)
14573c817da7Sdv 			fatal("%s: munmap", __func__);
14583c817da7Sdv 	}
14593c817da7Sdv 
14603c817da7Sdv 	/*
14613c817da7Sdv 	 * Ask vmm to enter the shared mappings for us. They'll point
14623c817da7Sdv 	 * to the same host physical memory, but will have a randomized
14633c817da7Sdv 	 * virtual address for the calling process.
14643c817da7Sdv 	 */
14653c817da7Sdv 	if (ioctl(vmm_fd, VMM_IOC_SHAREMEM, &vsp) == -1)
14663c817da7Sdv 		return (errno);
14673c817da7Sdv 
14683481ecdfSdv 	return (0);
14693481ecdfSdv }
1470c4fd4c5bSdv 
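/*
 * vcpu_halt
 *
 * Marks the given vcpu as halted under vm_mtx. The vcpu's run thread
 * then blocks on its run condition variable until vcpu_unhalt() clears
 * the flag and vcpu_signal_run() wakes it.
 */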
1471c4fd4c5bSdv void
1472c4fd4c5bSdv vcpu_halt(uint32_t vcpu_id)
1473c4fd4c5bSdv {
1474c4fd4c5bSdv 	mutex_lock(&vm_mtx);
1475c4fd4c5bSdv 	vcpu_hlt[vcpu_id] = 1;
1476c4fd4c5bSdv 	mutex_unlock(&vm_mtx);
1477c4fd4c5bSdv }
1478c4fd4c5bSdv 
1479c4fd4c5bSdv void
1480c4fd4c5bSdv vcpu_unhalt(uint32_t vcpu_id)
1481c4fd4c5bSdv 	{
1481c4fd4c5bSdv {
1483c4fd4c5bSdv 	vcpu_hlt[vcpu_id] = 0;
1484c4fd4c5bSdv 	mutex_unlock(&vm_mtx);
1485c4fd4c5bSdv }
1486c4fd4c5bSdv 
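/*
 * vcpu_signal_run
 *
 * Wakes the given vcpu's run thread by signalling its run condition
 * variable while holding the matching run mutex.
 */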
1487c4fd4c5bSdv void
1488c4fd4c5bSdv vcpu_signal_run(uint32_t vcpu_id)
1489c4fd4c5bSdv {
1490c4fd4c5bSdv 	int ret;
1491c4fd4c5bSdv 
1492c4fd4c5bSdv 	mutex_lock(&vcpu_run_mtx[vcpu_id]);
1493c4fd4c5bSdv 	ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]);
1494c4fd4c5bSdv 	if (ret)
1495c4fd4c5bSdv 		fatalx("%s: can't signal (%d)", __func__, ret);
1496c4fd4c5bSdv 	mutex_unlock(&vcpu_run_mtx[vcpu_id]);
1497c4fd4c5bSdv }
1498