/*	$OpenBSD: vm.c,v 1.110 2024/11/21 13:25:30 claudio Exp $	*/

/*
 * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>	/* PAGE_SIZE, MAXCOMLEN */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/resource.h>

#include <dev/vmm/vmm.h>

#include <errno.h>
#include <event.h>
#include <fcntl.h>
#include <imsg.h>
#include <poll.h>
#include <pthread.h>
#include <pthread_np.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <util.h>

#include "atomicio.h"
#include "pci.h"
#include "virtio.h"
#include "vmd.h"

#define MMIO_NOTYET 0

static int run_vm(struct vmop_create_params *, struct vcpu_reg_state *);
static void vm_dispatch_vmm(int, short, void *);
static void *event_thread(void *);
static void *vcpu_run_loop(void *);
static int vmm_create_vm(struct vmd_vm *);
static int alloc_guest_mem(struct vmd_vm *);
static int send_vm(int, struct vmd_vm *);
static int dump_vmr(int, struct vm_mem_range *);
static int dump_mem(int, struct vmd_vm *);
static void restore_vmr(int, struct vm_mem_range *);
static void restore_mem(int, struct vm_create_params *);
static int restore_vm_params(int, struct vm_create_params *);
static void pause_vm(struct vmd_vm *);
static void unpause_vm(struct vmd_vm *);
static int start_vm(struct vmd_vm *, int);

int con_fd;
struct vmd_vm *current_vm;

extern struct vmd *env;

extern char *__progname;

pthread_mutex_t threadmutex;
pthread_cond_t threadcond;

pthread_cond_t vcpu_run_cond[VMM_MAX_VCPUS_PER_VM];
pthread_mutex_t vcpu_run_mtx[VMM_MAX_VCPUS_PER_VM];
pthread_barrier_t vm_pause_barrier;
pthread_cond_t vcpu_unpause_cond[VMM_MAX_VCPUS_PER_VM];
pthread_mutex_t vcpu_unpause_mtx[VMM_MAX_VCPUS_PER_VM];

pthread_mutex_t vm_mtx;
uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM];
uint8_t vcpu_done[VMM_MAX_VCPUS_PER_VM];

/*
 * vm_main
 *
 * Primary entrypoint for launching a vm. Does not return.
 *
 * fd: file descriptor for communicating with vmm process.
 * fd_vmm: file descriptor for communicating with vmm(4) device
 */
void
vm_main(int fd, int fd_vmm)
{
	struct vm_create_params	*vcp = NULL;
	struct vmd_vm		 vm;
	size_t			 sz = 0;
	int			 ret = 0;

	/*
	 * The vm process relies on global state. Set the fd for /dev/vmm.
	 */
	env->vmd_fd = fd_vmm;

	/*
	 * We aren't root, so we can't chroot(2). Use unveil(2) instead.
	 */
	if (unveil(env->argv0, "x") == -1)
		fatal("unveil %s", env->argv0);
	if (unveil(NULL, NULL) == -1)
		fatal("unveil lock");

	/*
	 * pledge in the vm processes:
	 * stdio - for malloc and basic I/O including events.
	 * vmm - for the vmm ioctls and operations.
	 * proc exec - fork/exec for launching devices.
	 * recvfd - for vm send/recv and sending fd to devices.
	 */
	if (pledge("stdio vmm proc exec recvfd", NULL) == -1)
		fatal("pledge");

	/* Receive our vm configuration. */
	memset(&vm, 0, sizeof(vm));
	sz = atomicio(read, fd, &vm, sizeof(vm));
	if (sz != sizeof(vm)) {
		log_warnx("failed to receive start message");
		_exit(EIO);
	}

	/* Update process with the vm name. */
	vcp = &vm.vm_params.vmc_params;
	setproctitle("%s", vcp->vcp_name);
	log_procinit("vm/%s", vcp->vcp_name);

	/* Receive the local prefix settings. */
	sz = atomicio(read, fd, &env->vmd_cfg.cfg_localprefix,
	    sizeof(env->vmd_cfg.cfg_localprefix));
	if (sz != sizeof(env->vmd_cfg.cfg_localprefix)) {
		log_warnx("failed to receive local prefix");
		_exit(EIO);
	}

	/*
	 * We need, at minimum, a vm_kernel fd to boot a vm. This is either a
	 * kernel or a BIOS image.
	 */
	if (!(vm.vm_state & VM_STATE_RECEIVED)) {
		if (vm.vm_kernel == -1) {
			log_warnx("%s: failed to receive boot fd",
			    vcp->vcp_name);
			_exit(EINVAL);
		}
	}

	if (vcp->vcp_sev && env->vmd_psp_fd < 0) {
		log_warnx("%s not available", PSP_NODE);
		_exit(EINVAL);
	}

	ret = start_vm(&vm, fd);
	_exit(ret);
}
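
/*
 * A note on the start-up handshake above: the parent writes the whole
 * struct vmd_vm and then the local prefix over the channel fd, and
 * atomicio() loops over partial reads until the full size is
 * transferred, so a short count can only mean the channel to the
 * parent failed or hit EOF.
 */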

/*
 * start_vm
 *
 * After forking a new VM process, starts the new VM with the creation
 * parameters supplied (in the incoming vm->vm_params field). This
 * function performs a basic sanity check on the incoming parameters
 * and then performs the following steps to complete the creation of the VM:
 *
 * 1. validates and creates the new VM
 * 2. opens the imsg control channel to the parent and drops more privilege
 * 3. drops additional privileges by calling pledge(2)
 * 4. loads the kernel from the disk image or file descriptor
 * 5. runs the VM's VCPU loops.
 *
 * Parameters:
 *  vm: The VM data structure that includes the VM create parameters.
 *  fd: The imsg socket that is connected to the parent process.
 *
 * Return values:
 *  0: success
 *  !0: failure - typically an errno indicating the source of the failure
 */
int
start_vm(struct vmd_vm *vm, int fd)
{
	struct vmop_create_params *vmc = &vm->vm_params;
	struct vm_create_params	*vcp = &vmc->vmc_params;
	struct vcpu_reg_state	 vrs;
	int			 nicfds[VM_MAX_NICS_PER_VM];
	int			 ret;
	size_t			 i;
	struct vm_rwregs_params	 vrp;

	/*
	 * We first try to initialize and allocate memory before bothering
	 * vmm(4) with a request to create a new vm.
	 */
	if (!(vm->vm_state & VM_STATE_RECEIVED))
		create_memory_map(vcp);

	ret = alloc_guest_mem(vm);
	if (ret) {
		struct rlimit lim;
		char buf[FMT_SCALED_STRSIZE];
		if (ret == ENOMEM && getrlimit(RLIMIT_DATA, &lim) == 0) {
			if (fmt_scaled(lim.rlim_cur, buf) == 0)
				fatalx("could not allocate guest memory (data "
				    "limit is %s)", buf);
		}
		errno = ret;
		log_warn("could not allocate guest memory");
		return (ret);
	}

	/* We've allocated guest memory, so now create the vm in vmm(4). */
	ret = vmm_create_vm(vm);
	if (ret) {
		/* Let the vmm process know we failed by sending a 0 vm id. */
		vcp->vcp_id = 0;
		atomicio(vwrite, fd, &vcp->vcp_id, sizeof(vcp->vcp_id));
		return (ret);
	}

	/* Set up SEV. */
	ret = sev_init(vm);
	if (ret) {
		log_warnx("could not initialize SEV");
		return (ret);
	}

	/*
	 * Some of vmd currently relies on global state (current_vm, con_fd).
	 */
	current_vm = vm;
	con_fd = vm->vm_tty;
	if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1) {
		log_warn("failed to set nonblocking mode on console");
		return (1);
	}

	/*
	 * We now let the vmm process know we were successful by sending it our
	 * vmm(4) assigned vm id.
	 */
	if (atomicio(vwrite, fd, &vcp->vcp_id, sizeof(vcp->vcp_id)) !=
	    sizeof(vcp->vcp_id)) {
		log_warn("failed to send created vm id to vmm process");
		return (1);
	}

	/* Prepare either our boot image or receive an existing vm to launch. */
	if (vm->vm_state & VM_STATE_RECEIVED) {
		ret = atomicio(read, vm->vm_receive_fd, &vrp, sizeof(vrp));
		if (ret != sizeof(vrp))
			fatal("received incomplete vrp - exiting");
		vrs = vrp.vrwp_regs;
	} else if (load_firmware(vm, &vrs))
		fatalx("failed to load kernel or firmware image");

	if (vm->vm_kernel != -1)
		close_fd(vm->vm_kernel);

	/* Initialize our mutexes. */
	ret = pthread_mutex_init(&threadmutex, NULL);
	if (ret) {
		log_warn("%s: could not initialize thread state mutex",
		    __func__);
		return (ret);
	}
	ret = pthread_cond_init(&threadcond, NULL);
	if (ret) {
		log_warn("%s: could not initialize thread state "
		    "condition variable", __func__);
		return (ret);
	}
	ret = pthread_mutex_init(&vm_mtx, NULL);
	if (ret) {
		log_warn("%s: could not initialize vm state mutex",
		    __func__);
		return (ret);
	}

	/* Lock thread mutex now. It's unlocked when waiting on threadcond. */
	mutex_lock(&threadmutex);

	/*
	 * Finalize our communication socket with the vmm process. From here
	 * onwards, communication with the vmm process is event-based.
	 */
	event_init();
	if (vmm_pipe(vm, fd, vm_dispatch_vmm) == -1)
		fatal("setup vm pipe");

	/*
	 * Initialize or restore our emulated hardware.
	 */
	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
		nicfds[i] = vm->vm_ifs[i].vif_fd;

	if (vm->vm_state & VM_STATE_RECEIVED) {
		restore_mem(vm->vm_receive_fd, vcp);
		restore_emulated_hw(vcp, vm->vm_receive_fd, nicfds,
		    vm->vm_disks, vm->vm_cdrom);
		if (restore_vm_params(vm->vm_receive_fd, vcp))
			fatal("restore vm params failed");
		unpause_vm(vm);
	} else
		init_emulated_hw(vmc, vm->vm_cdrom, vm->vm_disks, nicfds);

	/* Drop privileges further before starting the vcpu run loop(s). */
	if (pledge("stdio vmm recvfd", NULL) == -1)
		fatal("pledge");

	/*
	 * Execute the vcpu run loop(s) for this VM.
	 */
	ret = run_vm(&vm->vm_params, &vrs);

	/* Shutdown SEV. */
	if (sev_shutdown(vm))
		log_warnx("%s: could not shutdown SEV", __func__);

	/* Ensure that any in-flight data is written back */
	virtio_shutdown(vm);

	return (ret);
}

/*
 * vm_dispatch_vmm
 *
 * imsg callback for messages that are received from the vmm parent process.
 */
void
vm_dispatch_vmm(int fd, short event, void *arg)
{
	struct vmd_vm		*vm = arg;
	struct vmop_result	 vmr;
	struct vmop_addr_result	 var;
	struct imsgev		*iev = &vm->vm_iev;
	struct imsgbuf		*ibuf = &iev->ibuf;
	struct imsg		 imsg;
	ssize_t			 n;
	int			 verbose;

	if (event & EV_READ) {
		if ((n = imsgbuf_read(ibuf)) == -1)
			fatal("%s: imsgbuf_read", __func__);
		if (n == 0)
			_exit(0);
	}

	if (event & EV_WRITE) {
		if (imsgbuf_write(ibuf) == -1) {
			if (errno == EPIPE)
				_exit(0);
			fatal("%s: imsgbuf_write fd %d", __func__, ibuf->fd);
		}
	}

	for (;;) {
		if ((n = imsg_get(ibuf, &imsg)) == -1)
			fatal("%s: imsg_get", __func__);
		if (n == 0)
			break;

#if DEBUG > 1
		log_debug("%s: got imsg %d from %s",
		    __func__, imsg.hdr.type,
		    vm->vm_params.vmc_params.vcp_name);
#endif

		switch (imsg.hdr.type) {
		case IMSG_CTL_VERBOSE:
			IMSG_SIZE_CHECK(&imsg, &verbose);
			memcpy(&verbose, imsg.data, sizeof(verbose));
			log_setverbose(verbose);
			virtio_broadcast_imsg(vm, IMSG_CTL_VERBOSE, &verbose,
			    sizeof(verbose));
			break;
		case IMSG_VMDOP_VM_SHUTDOWN:
			if (vmmci_ctl(VMMCI_SHUTDOWN) == -1)
				_exit(0);
			break;
		case IMSG_VMDOP_VM_REBOOT:
			if (vmmci_ctl(VMMCI_REBOOT) == -1)
				_exit(0);
			break;
		case IMSG_VMDOP_PAUSE_VM:
			vmr.vmr_result = 0;
			vmr.vmr_id = vm->vm_vmid;
			pause_vm(vm);
			imsg_compose_event(&vm->vm_iev,
			    IMSG_VMDOP_PAUSE_VM_RESPONSE,
			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
			    sizeof(vmr));
			break;
		case IMSG_VMDOP_UNPAUSE_VM:
			vmr.vmr_result = 0;
			vmr.vmr_id = vm->vm_vmid;
			unpause_vm(vm);
			imsg_compose_event(&vm->vm_iev,
			    IMSG_VMDOP_UNPAUSE_VM_RESPONSE,
			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
			    sizeof(vmr));
			break;
		case IMSG_VMDOP_SEND_VM_REQUEST:
			vmr.vmr_id = vm->vm_vmid;
			vmr.vmr_result = send_vm(imsg_get_fd(&imsg), vm);
			imsg_compose_event(&vm->vm_iev,
			    IMSG_VMDOP_SEND_VM_RESPONSE,
			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
			    sizeof(vmr));
			if (!vmr.vmr_result) {
				imsgbuf_flush(&current_vm->vm_iev.ibuf);
				_exit(0);
			}
			break;
		case IMSG_VMDOP_PRIV_GET_ADDR_RESPONSE:
			IMSG_SIZE_CHECK(&imsg, &var);
			memcpy(&var, imsg.data, sizeof(var));

			log_debug("%s: received tap addr %s for nic %d",
			    vm->vm_params.vmc_params.vcp_name,
			    ether_ntoa((void *)var.var_addr), var.var_nic_idx);

			vionet_set_hostmac(vm, var.var_nic_idx, var.var_addr);
			break;
		default:
			fatalx("%s: got invalid imsg %d from %s",
			    __func__, imsg.hdr.type,
			    vm->vm_params.vmc_params.vcp_name);
		}
		imsg_free(&imsg);
	}
	imsg_event_add(iev);
}

/*
 * vm_shutdown
 *
 * Tell the vmm parent process to shutdown or reboot the VM and exit.
 */
__dead void
vm_shutdown(unsigned int cmd)
{
	switch (cmd) {
	case VMMCI_NONE:
	case VMMCI_SHUTDOWN:
		(void)imsg_compose_event(&current_vm->vm_iev,
		    IMSG_VMDOP_VM_SHUTDOWN, 0, 0, -1, NULL, 0);
		break;
	case VMMCI_REBOOT:
		(void)imsg_compose_event(&current_vm->vm_iev,
		    IMSG_VMDOP_VM_REBOOT, 0, 0, -1, NULL, 0);
		break;
	default:
		fatalx("invalid vm ctl command: %d", cmd);
	}
	imsgbuf_flush(&current_vm->vm_iev.ibuf);

	if (sev_shutdown(current_vm))
		log_warnx("%s: could not shutdown SEV", __func__);

	_exit(0);
}

int
send_vm(int fd, struct vmd_vm *vm)
{
	struct vm_rwregs_params	 vrp;
	struct vm_rwvmparams_params vpp;
	struct vmop_create_params *vmc;
	struct vm_terminate_params vtp;
	unsigned int		 flags = 0;
	unsigned int		 i;
	int			 ret = 0;
	size_t			 sz;

	if (dump_send_header(fd)) {
		log_warnx("%s: failed to send vm dump header", __func__);
		goto err;
	}

	pause_vm(vm);

	vmc = calloc(1, sizeof(struct vmop_create_params));
	if (vmc == NULL) {
		log_warn("%s: calloc error getting vmc", __func__);
		ret = -1;
		goto err;
	}

	flags |= VMOP_CREATE_MEMORY;
	memcpy(&vmc->vmc_params, &current_vm->vm_params, sizeof(struct
	    vmop_create_params));
	vmc->vmc_flags = flags;
	vrp.vrwp_vm_id = vm->vm_params.vmc_params.vcp_id;
	vrp.vrwp_mask = VM_RWREGS_ALL;
	vpp.vpp_mask = VM_RWVMPARAMS_ALL;
	vpp.vpp_vm_id = vm->vm_params.vmc_params.vcp_id;

	sz = atomicio(vwrite, fd, vmc, sizeof(struct vmop_create_params));
	if (sz != sizeof(struct vmop_create_params)) {
		ret = -1;
		goto err;
	}

	for (i = 0; i < vm->vm_params.vmc_params.vcp_ncpus; i++) {
		vrp.vrwp_vcpu_id = i;
		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READREGS, &vrp))) {
			log_warn("%s: readregs failed", __func__);
			goto err;
		}

		sz = atomicio(vwrite, fd, &vrp,
		    sizeof(struct vm_rwregs_params));
		if (sz != sizeof(struct vm_rwregs_params)) {
			log_warn("%s: dumping registers failed", __func__);
			ret = -1;
			goto err;
		}
	}
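
	/*
	 * The rest of the dump stream is written in the fixed order the
	 * receive side expects: guest memory, then device state
	 * (dump_devs, pci_dump, virtio_dump), then per-vcpu vm params.
	 */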
	/* Dump memory before devices to aid in restoration. */
	if ((ret = dump_mem(fd, vm)))
		goto err;
	if ((ret = dump_devs(fd)))
		goto err;
	if ((ret = pci_dump(fd)))
		goto err;
	if ((ret = virtio_dump(fd)))
		goto err;

	for (i = 0; i < vm->vm_params.vmc_params.vcp_ncpus; i++) {
		vpp.vpp_vcpu_id = i;
		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READVMPARAMS, &vpp))) {
			log_warn("%s: readvmparams failed", __func__);
			goto err;
		}

		sz = atomicio(vwrite, fd, &vpp,
		    sizeof(struct vm_rwvmparams_params));
		if (sz != sizeof(struct vm_rwvmparams_params)) {
			log_warn("%s: dumping vm params failed", __func__);
			ret = -1;
			goto err;
		}
	}

	vtp.vtp_vm_id = vm->vm_params.vmc_params.vcp_id;
	if (ioctl(env->vmd_fd, VMM_IOC_TERM, &vtp) == -1) {
		log_warnx("%s: term IOC error: %d, %d", __func__,
		    errno, ENOENT);
	}
err:
	close(fd);
	if (ret)
		unpause_vm(vm);
	return ret;
}

int
dump_mem(int fd, struct vmd_vm *vm)
{
	unsigned int		 i;
	int			 ret;
	struct vm_mem_range	*vmr;

	for (i = 0; i < vm->vm_params.vmc_params.vcp_nmemranges; i++) {
		vmr = &vm->vm_params.vmc_params.vcp_memranges[i];
		ret = dump_vmr(fd, vmr);
		if (ret)
			return ret;
	}
	return (0);
}

int
restore_vm_params(int fd, struct vm_create_params *vcp)
{
	unsigned int			 i;
	struct vm_rwvmparams_params	 vpp;

	for (i = 0; i < vcp->vcp_ncpus; i++) {
		if (atomicio(read, fd, &vpp, sizeof(vpp)) != sizeof(vpp)) {
			log_warn("%s: error restoring vm params", __func__);
			return (-1);
		}
		vpp.vpp_vm_id = vcp->vcp_id;
		vpp.vpp_vcpu_id = i;
		if (ioctl(env->vmd_fd, VMM_IOC_WRITEVMPARAMS, &vpp) < 0) {
			log_debug("%s: writing vm params failed", __func__);
			return (-1);
		}
	}
	return (0);
}

void
restore_mem(int fd, struct vm_create_params *vcp)
{
	unsigned int		 i;
	struct vm_mem_range	*vmr;

	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		restore_vmr(fd, vmr);
	}
}
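
/*
 * The two helpers below stream guest memory one PAGE_SIZE buffer at a
 * time. Both loops effectively assume vmr_size is a multiple of
 * PAGE_SIZE, which holds for the page-aligned memranges vmd creates.
 */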
int
dump_vmr(int fd, struct vm_mem_range *vmr)
{
	size_t	rem = vmr->vmr_size, read = 0;
	char	buf[PAGE_SIZE];

	while (rem > 0) {
		if (read_mem(vmr->vmr_gpa + read, buf, PAGE_SIZE)) {
			log_warn("failed to read vmr");
			return (-1);
		}
		if (atomicio(vwrite, fd, buf, sizeof(buf)) != sizeof(buf)) {
			log_warn("failed to dump vmr");
			return (-1);
		}
		rem = rem - PAGE_SIZE;
		read = read + PAGE_SIZE;
	}
	return (0);
}

void
restore_vmr(int fd, struct vm_mem_range *vmr)
{
	size_t	rem = vmr->vmr_size, wrote = 0;
	char	buf[PAGE_SIZE];

	while (rem > 0) {
		if (atomicio(read, fd, buf, sizeof(buf)) != sizeof(buf))
			fatal("failed to restore vmr");
		if (write_mem(vmr->vmr_gpa + wrote, buf, PAGE_SIZE))
			fatal("failed to write vmr");
		rem = rem - PAGE_SIZE;
		wrote = wrote + PAGE_SIZE;
	}
}

static void
pause_vm(struct vmd_vm *vm)
{
	unsigned int n;
	int ret;

	mutex_lock(&vm_mtx);
	if (vm->vm_state & VM_STATE_PAUSED) {
		mutex_unlock(&vm_mtx);
		return;
	}
	current_vm->vm_state |= VM_STATE_PAUSED;
	mutex_unlock(&vm_mtx);

	ret = pthread_barrier_init(&vm_pause_barrier, NULL,
	    vm->vm_params.vmc_params.vcp_ncpus + 1);
	if (ret) {
		log_warnx("%s: cannot initialize pause barrier (%d)",
		    __progname, ret);
		return;
	}

	for (n = 0; n < vm->vm_params.vmc_params.vcp_ncpus; n++) {
		ret = pthread_cond_broadcast(&vcpu_run_cond[n]);
		if (ret) {
			log_warnx("%s: can't broadcast vcpu run cond (%d)",
			    __func__, (int)ret);
			return;
		}
	}
	ret = pthread_barrier_wait(&vm_pause_barrier);
	if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) {
		log_warnx("%s: could not wait on pause barrier (%d)",
		    __func__, (int)ret);
		return;
	}

	ret = pthread_barrier_destroy(&vm_pause_barrier);
	if (ret) {
		log_warnx("%s: could not destroy pause barrier (%d)",
		    __progname, ret);
		return;
	}

	pause_vm_md(vm);
}

static void
unpause_vm(struct vmd_vm *vm)
{
	unsigned int n;
	int ret;

	mutex_lock(&vm_mtx);
	if (!(vm->vm_state & VM_STATE_PAUSED)) {
		mutex_unlock(&vm_mtx);
		return;
	}
	current_vm->vm_state &= ~VM_STATE_PAUSED;
	mutex_unlock(&vm_mtx);

	for (n = 0; n < vm->vm_params.vmc_params.vcp_ncpus; n++) {
		ret = pthread_cond_broadcast(&vcpu_unpause_cond[n]);
		if (ret) {
			log_warnx("%s: can't broadcast vcpu unpause cond (%d)",
			    __func__, (int)ret);
			return;
		}
	}

	unpause_vm_md(vm);
}
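
/*
 * A note on the pause protocol above: pause_vm() sizes vm_pause_barrier
 * at vcp_ncpus + 1, so the barrier only opens once every vcpu thread
 * has parked itself in vcpu_run_loop(), plus this control thread.
 * unpause_vm() then releases the vcpu threads again by broadcasting
 * their per-vcpu unpause condition variables.
 */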

/*
 * vcpu_reset
 *
 * Requests vmm(4) to reset the VCPUs in the indicated VM to
 * the register state provided
 *
 * Parameters
 *  vmid: VM ID to reset
 *  vcpu_id: VCPU ID to reset
 *  vrs: the register state to initialize
 *
 * Return values:
 *  0: success
 *  !0: ioctl to vmm(4) failed (eg, ENOENT if the supplied VM ID is not
 *      valid)
 */
int
vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs)
{
	struct vm_resetcpu_params vrp;

	memset(&vrp, 0, sizeof(vrp));
	vrp.vrp_vm_id = vmid;
	vrp.vrp_vcpu_id = vcpu_id;
	memcpy(&vrp.vrp_init_state, vrs, sizeof(struct vcpu_reg_state));

	log_debug("%s: resetting vcpu %d for vm %d", __func__, vcpu_id, vmid);

	if (ioctl(env->vmd_fd, VMM_IOC_RESETCPU, &vrp) == -1)
		return (errno);

	return (0);
}

/*
 * alloc_guest_mem
 *
 * Allocates memory for the guest.
 * Instead of doing a single allocation with one mmap(), we allocate memory
 * separately for every range for the following reasons:
 * - ASLR for the individual ranges
 * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to
 *   map the single mmap'd userspace memory to the individual guest physical
 *   memory ranges, the underlying amap of the single mmap'd range would have
 *   to allocate per-page reference counters. The reason is that the
 *   individual guest physical ranges would reference the single mmap'd region
 *   only partially. However, if every guest physical range has its own
 *   corresponding mmap'd userspace allocation, there are no partial
 *   references: every guest physical range fully references an mmap'd
 *   range => no per-page reference counters have to be allocated.
 *
 * Return values:
 *  0: success
 *  !0: failure - errno indicating the source of the failure
 */
int
alloc_guest_mem(struct vmd_vm *vm)
{
	void *p;
	int ret = 0;
	size_t i, j;
	struct vm_create_params *vcp = &vm->vm_params.vmc_params;
	struct vm_mem_range *vmr;

	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];

		/*
		 * We only need R/W as userland. vmm(4) will use R/W/X in its
		 * mapping.
		 *
		 * We must use MAP_SHARED so emulated devices will be able
		 * to generate shared mappings.
		 */
		p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
		    MAP_ANON | MAP_CONCEAL | MAP_SHARED, -1, 0);
		if (p == MAP_FAILED) {
			ret = errno;
			for (j = 0; j < i; j++) {
				vmr = &vcp->vcp_memranges[j];
				munmap((void *)vmr->vmr_va, vmr->vmr_size);
			}
			return (ret);
		}
		vmr->vmr_va = (vaddr_t)p;
	}

	return (ret);
}

/*
 * vmm_create_vm
 *
 * Requests vmm(4) to create a new VM using the supplied creation
 * parameters. This operation results in the creation of the in-kernel
 * structures for the VM, but does not start the VM's vcpu(s).
 *
 * Parameters:
 *  vm: pointer to the vm object
 *
 * Return values:
 *  0: success
 *  !0: ioctl to vmm(4) failed
 */
static int
vmm_create_vm(struct vmd_vm *vm)
{
	struct vm_create_params *vcp = &vm->vm_params.vmc_params;
	size_t i;

	/* Sanity check arguments */
	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_nmemranges == 0 ||
	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
		return (EINVAL);

	if (vm->vm_params.vmc_ndisks > VM_MAX_DISKS_PER_VM)
		return (EINVAL);

	if (vm->vm_params.vmc_nnics > VM_MAX_NICS_PER_VM)
		return (EINVAL);

	if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) == -1)
		return (errno);

	for (i = 0; i < vcp->vcp_ncpus; i++)
		vm->vm_sev_asid[i] = vcp->vcp_asid[i];

	return (0);
}
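
/*
 * Note: vcp_asid[] is expected to be filled in by the VMM_IOC_CREATE
 * ioctl above; the copy into vm_sev_asid[] keeps the kernel-assigned
 * SEV ASIDs around for the sev_activate()/sev_encrypt_memory() calls
 * made later from run_vm().
 */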

/*
 * run_vm
 *
 * Runs the VM whose creation parameters are specified in vcp
 *
 * Parameters:
 *  child_cdrom: previously-opened child ISO disk file descriptor
 *  child_disks: previously-opened child VM disk file descriptors
 *  child_taps: previously-opened child tap file descriptors
 *  vmc: vmop_create_params struct containing the VM's desired creation
 *      configuration
 *  vrs: VCPU register state to initialize
 *
 * Return values:
 *  0: the VM exited normally
 *  !0: the VM exited abnormally or failed to start
 */
static int
run_vm(struct vmop_create_params *vmc, struct vcpu_reg_state *vrs)
{
	struct vm_create_params *vcp = &vmc->vmc_params;
	struct vm_rwregs_params vregsp;
	uint8_t evdone = 0;
	size_t i;
	int ret;
	pthread_t *tid, evtid;
	char tname[MAXCOMLEN + 1];
	struct vm_run_params **vrp;
	void *exit_status;

	if (vcp == NULL)
		return (EINVAL);

	if (vcp->vcp_nmemranges == 0 ||
	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
		return (EINVAL);

	tid = calloc(vcp->vcp_ncpus, sizeof(pthread_t));
	vrp = calloc(vcp->vcp_ncpus, sizeof(struct vm_run_params *));
	if (tid == NULL || vrp == NULL) {
		log_warn("%s: memory allocation error - exiting.",
		    __progname);
		return (ENOMEM);
	}

	log_debug("%s: starting %zu vcpu thread(s) for vm %s", __func__,
	    vcp->vcp_ncpus, vcp->vcp_name);

	/*
	 * Create and launch one thread for each VCPU. These threads may
	 * migrate between PCPUs over time; the need to reload CPU state
	 * in such situations is detected and performed by vmm(4) in the
	 * kernel.
	 */
	for (i = 0; i < vcp->vcp_ncpus; i++) {
		vrp[i] = malloc(sizeof(struct vm_run_params));
		if (vrp[i] == NULL) {
			log_warn("%s: memory allocation error - "
			    "exiting.", __progname);
			/* caller will exit, so skip freeing */
			return (ENOMEM);
		}
		vrp[i]->vrp_exit = malloc(sizeof(struct vm_exit));
		if (vrp[i]->vrp_exit == NULL) {
			log_warn("%s: memory allocation error - "
			    "exiting.", __progname);
			/* caller will exit, so skip freeing */
			return (ENOMEM);
		}
		vrp[i]->vrp_vm_id = vcp->vcp_id;
		vrp[i]->vrp_vcpu_id = i;

		if (vcpu_reset(vcp->vcp_id, i, vrs)) {
			log_warnx("%s: cannot reset VCPU %zu - exiting.",
			    __progname, i);
			return (EIO);
		}

		if (sev_activate(current_vm, i)) {
			log_warnx("%s: SEV activation for VCPU "
			    "%zu failed - exiting.", __progname, i);
			return (EIO);
		}

		if (sev_encrypt_memory(current_vm)) {
			log_warnx("%s: memory encryption for VCPU "
			    "%zu failed - exiting.", __progname, i);
			return (EIO);
		}

		/* once more because reset_cpu changes regs */
		if (current_vm->vm_state & VM_STATE_RECEIVED) {
			vregsp.vrwp_vm_id = vcp->vcp_id;
			vregsp.vrwp_vcpu_id = i;
			vregsp.vrwp_regs = *vrs;
			vregsp.vrwp_mask = VM_RWREGS_ALL;
			if ((ret = ioctl(env->vmd_fd, VMM_IOC_WRITEREGS,
			    &vregsp)) == -1) {
				log_warn("%s: writeregs failed", __func__);
				return (ret);
			}
		}

		ret = pthread_cond_init(&vcpu_run_cond[i], NULL);
		if (ret) {
			log_warnx("%s: cannot initialize cond var (%d)",
			    __progname, ret);
			return (ret);
		}

		ret = pthread_mutex_init(&vcpu_run_mtx[i], NULL);
		if (ret) {
			log_warnx("%s: cannot initialize mtx (%d)",
			    __progname, ret);
			return (ret);
		}

		ret = pthread_cond_init(&vcpu_unpause_cond[i], NULL);
		if (ret) {
			log_warnx("%s: cannot initialize unpause var (%d)",
			    __progname, ret);
			return (ret);
		}

		ret = pthread_mutex_init(&vcpu_unpause_mtx[i], NULL);
		if (ret) {
			log_warnx("%s: cannot initialize unpause mtx (%d)",
			    __progname, ret);
			return (ret);
		}

		vcpu_hlt[i] = 0;

		/* Start each VCPU run thread at vcpu_run_loop */
		ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]);
		if (ret) {
			/* caller will _exit after this return */
			ret = errno;
			log_warn("%s: could not create vcpu thread %zu",
			    __func__, i);
			return (ret);
		}

		snprintf(tname, sizeof(tname), "vcpu-%zu", i);
		pthread_set_name_np(tid[i], tname);
	}

	log_debug("%s: waiting on events for VM %s", __func__, vcp->vcp_name);
	ret = pthread_create(&evtid, NULL, event_thread, &evdone);
	if (ret) {
		errno = ret;
		log_warn("%s: could not create event thread", __func__);
		return (ret);
	}
	pthread_set_name_np(evtid, "event");

	for (;;) {
		ret = pthread_cond_wait(&threadcond, &threadmutex);
		if (ret) {
			log_warn("%s: waiting on thread state condition "
			    "variable failed", __func__);
			return (ret);
		}

		/*
		 * Did a VCPU thread exit with an error? => return the first one
		 */
		mutex_lock(&vm_mtx);
		for (i = 0; i < vcp->vcp_ncpus; i++) {
			if (vcpu_done[i] == 0)
				continue;

			if (pthread_join(tid[i], &exit_status)) {
				log_warn("%s: failed to join thread %zd - "
				    "exiting", __progname, i);
				mutex_unlock(&vm_mtx);
				return (EIO);
			}

			ret = (intptr_t)exit_status;
		}
		mutex_unlock(&vm_mtx);

		/* Did the event thread exit? => return with an error */
		if (evdone) {
			if (pthread_join(evtid, &exit_status)) {
				log_warn("%s: failed to join event thread - "
				    "exiting", __progname);
				return (EIO);
			}

			log_warnx("%s: vm %d event thread exited "
			    "unexpectedly", __progname, vcp->vcp_id);
			return (EIO);
		}
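
		/*
		 * Note: both vcpu_run_loop() and event_thread() signal
		 * threadcond after flagging vcpu_done[] or evdone, which
		 * is what wakes this loop to re-evaluate the checks above
		 * and below.
		 */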
		/* Did all VCPU threads exit successfully? => return */
		mutex_lock(&vm_mtx);
		for (i = 0; i < vcp->vcp_ncpus; i++) {
			if (vcpu_done[i] == 0)
				break;
		}
		mutex_unlock(&vm_mtx);
		if (i == vcp->vcp_ncpus)
			return (ret);

		/* Some more threads to wait for, start over */
	}

	return (ret);
}

static void *
event_thread(void *arg)
{
	uint8_t *donep = arg;
	intptr_t ret;

	ret = event_dispatch();

	*donep = 1;

	mutex_lock(&threadmutex);
	pthread_cond_signal(&threadcond);
	mutex_unlock(&threadmutex);

	return (void *)ret;
}

/*
 * vcpu_run_loop
 *
 * Runs a single VCPU until vmm(4) requires help handling an exit,
 * or the VM terminates.
 *
 * Parameters:
 *  arg: vcpu_run_params for the VCPU being run by this thread
 *
 * Return values:
 *  NULL: the VCPU shutdown properly
 *  !NULL: error processing VCPU run, or the VCPU shutdown abnormally
 */
static void *
vcpu_run_loop(void *arg)
{
	struct vm_run_params *vrp = (struct vm_run_params *)arg;
	intptr_t ret = 0;
	uint32_t n = vrp->vrp_vcpu_id;
	int paused = 0, halted = 0;

	for (;;) {
		ret = pthread_mutex_lock(&vcpu_run_mtx[n]);

		if (ret) {
			log_warnx("%s: can't lock vcpu run mtx (%d)",
			    __func__, (int)ret);
			return ((void *)ret);
		}

		mutex_lock(&vm_mtx);
		paused = (current_vm->vm_state & VM_STATE_PAUSED) != 0;
		halted = vcpu_hlt[n];
		mutex_unlock(&vm_mtx);

		/* If we are halted and need to pause, pause */
		if (halted && paused) {
			ret = pthread_barrier_wait(&vm_pause_barrier);
			if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) {
				log_warnx("%s: could not wait on pause barrier (%d)",
				    __func__, (int)ret);
				return ((void *)ret);
			}

			ret = pthread_mutex_lock(&vcpu_unpause_mtx[n]);
			if (ret) {
				log_warnx("%s: can't lock vcpu unpause mtx (%d)",
				    __func__, (int)ret);
				return ((void *)ret);
			}
			/* Interrupt may be firing, release run mtx. */
			mutex_unlock(&vcpu_run_mtx[n]);
			ret = pthread_cond_wait(&vcpu_unpause_cond[n],
			    &vcpu_unpause_mtx[n]);
			if (ret) {
				log_warnx(
				    "%s: can't wait on unpause cond (%d)",
				    __func__, (int)ret);
				break;
			}
			mutex_lock(&vcpu_run_mtx[n]);

			ret = pthread_mutex_unlock(&vcpu_unpause_mtx[n]);
			if (ret) {
				log_warnx("%s: can't unlock unpause mtx (%d)",
				    __func__, (int)ret);
				break;
			}
		}

		/* If we are halted and not paused, wait */
		if (halted) {
			ret = pthread_cond_wait(&vcpu_run_cond[n],
			    &vcpu_run_mtx[n]);

			if (ret) {
				log_warnx(
				    "%s: can't wait on cond (%d)",
				    __func__, (int)ret);
				(void)pthread_mutex_unlock(
				    &vcpu_run_mtx[n]);
				break;
			}
		}

		ret = pthread_mutex_unlock(&vcpu_run_mtx[n]);

		if (ret) {
			log_warnx("%s: can't unlock mutex on cond (%d)",
			    __func__, (int)ret);
			break;
		}

		if (vrp->vrp_irqready && intr_pending(current_vm)) {
			vrp->vrp_inject.vie_vector = intr_ack(current_vm);
			vrp->vrp_inject.vie_type = VCPU_INJECT_INTR;
		} else
			vrp->vrp_inject.vie_type = VCPU_INJECT_NONE;

		/* Still more interrupts pending? */
		vrp->vrp_intr_pending = intr_pending(current_vm);

		if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) == -1) {
			/* If run ioctl failed, exit */
			ret = errno;
			log_warn("%s: vm %d / vcpu %d run ioctl failed",
			    __func__, current_vm->vm_vmid, n);
			break;
		}

		/* If the VM is terminating, exit normally */
		if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED) {
			ret = (intptr_t)NULL;
			break;
		}

		if (vrp->vrp_exit_reason != VM_EXIT_NONE) {
			/*
			 * vmm(4) needs help handling an exit, handle in
			 * vcpu_exit.
			 */
			ret = vcpu_exit(vrp);
			if (ret)
				break;
		}
	}

	mutex_lock(&vm_mtx);
	vcpu_done[n] = 1;
	mutex_unlock(&vm_mtx);

	mutex_lock(&threadmutex);
	pthread_cond_signal(&threadcond);
	mutex_unlock(&threadmutex);

	return ((void *)ret);
}
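
/*
 * vcpu_intr
 *
 * Request that vmm(4) inject the given interrupt vector on a VCPU via
 * the VMM_IOC_INTR ioctl.
 *
 * Return values:
 *  0: success
 *  !0: ioctl to vmm(4) failed, and errno is returned
 */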
int
vcpu_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr)
{
	struct vm_intr_params vip;

	memset(&vip, 0, sizeof(vip));

	vip.vip_vm_id = vm_id;
	vip.vip_vcpu_id = vcpu_id;	/* XXX always 0? */
	vip.vip_intr = intr;

	if (ioctl(env->vmd_fd, VMM_IOC_INTR, &vip) == -1)
		return (errno);

	return (0);
}

/*
 * fd_hasdata
 *
 * Determines if data can be read from a file descriptor.
 *
 * Parameters:
 *  fd: the fd to check
 *
 * Return values:
 *  1 if data can be read from an fd, or 0 otherwise.
 */
int
fd_hasdata(int fd)
{
	struct pollfd pfd[1];
	int nready, hasdata = 0;

	pfd[0].fd = fd;
	pfd[0].events = POLLIN;
	nready = poll(pfd, 1, 0);
	if (nready == -1)
		log_warn("checking file descriptor for data failed");
	else if (nready == 1 && pfd[0].revents & POLLIN)
		hasdata = 1;
	return (hasdata);
}

/*
 * mutex_lock
 *
 * Wrapper function for pthread_mutex_lock that does error checking and that
 * exits on failure
 */
void
mutex_lock(pthread_mutex_t *m)
{
	int ret;

	ret = pthread_mutex_lock(m);
	if (ret) {
		errno = ret;
		fatal("could not acquire mutex");
	}
}

/*
 * mutex_unlock
 *
 * Wrapper function for pthread_mutex_unlock that does error checking and that
 * exits on failure
 */
void
mutex_unlock(pthread_mutex_t *m)
{
	int ret;

	ret = pthread_mutex_unlock(m);
	if (ret) {
		errno = ret;
		fatal("could not release mutex");
	}
}

void
vm_pipe_init(struct vm_dev_pipe *p, void (*cb)(int, short, void *))
{
	vm_pipe_init2(p, cb, NULL);
}

/*
 * vm_pipe_init2
 *
 * Initialize a vm_dev_pipe, setting up its file descriptors and its
 * event structure with the given callback and argument.
 *
 * Parameters:
 *  p: pointer to vm_dev_pipe struct to initialize
 *  cb: callback to use for READ events on the read end of the pipe
 *  arg: pointer to pass to the callback on event trigger
 */
void
vm_pipe_init2(struct vm_dev_pipe *p, void (*cb)(int, short, void *), void *arg)
{
	int ret;
	int fds[2];

	memset(p, 0, sizeof(struct vm_dev_pipe));

	ret = pipe2(fds, O_CLOEXEC);
	if (ret)
		fatal("failed to create vm_dev_pipe pipe");

	p->read = fds[0];
	p->write = fds[1];

	event_set(&p->read_ev, p->read, EV_READ | EV_PERSIST, cb, arg);
}

/*
 * vm_pipe_send
 *
 * Send a message to an emulated device via the provided vm_dev_pipe. This
 * relies on the fact sizeof(msg) < PIPE_BUF to ensure atomic writes.
 *
 * Parameters:
 *  p: pointer to initialized vm_dev_pipe
 *  msg: message to send in the channel
 */
void
vm_pipe_send(struct vm_dev_pipe *p, enum pipe_msg_type msg)
{
	size_t n;
	n = write(p->write, &msg, sizeof(msg));
	if (n != sizeof(msg))
		fatal("failed to write to device pipe");
}

/*
 * vm_pipe_recv
 *
 * Receive a message for an emulated device via the provided vm_dev_pipe.
 * Returns the message value, otherwise will exit on failure. This relies on
 * the fact sizeof(enum pipe_msg_type) < PIPE_BUF for atomic reads.
 *
 * Parameters:
 *  p: pointer to initialized vm_dev_pipe
 *
 * Return values:
 *  a value of enum pipe_msg_type or fatal exit on read(2) error
 */
enum pipe_msg_type
vm_pipe_recv(struct vm_dev_pipe *p)
{
	size_t n;
	enum pipe_msg_type msg;
	n = read(p->read, &msg, sizeof(msg));
	if (n != sizeof(msg))
		fatal("failed to read from device pipe");

	return msg;
}
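
/*
 * A note on the two functions above: POSIX guarantees that writes of
 * fewer than PIPE_BUF bytes to a pipe are atomic, so device threads
 * can post messages concurrently without them interleaving; the
 * matching read side pulls whole messages for the same reason.
 */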

/*
 * Re-map the guest address space using vmm(4)'s VMM_IOC_SHAREMEM
 *
 * Returns 0 on success, non-zero in event of failure.
 */
int
remap_guest_mem(struct vmd_vm *vm, int vmm_fd)
{
	struct vm_create_params	*vcp;
	struct vm_mem_range	*vmr;
	struct vm_sharemem_params vsp;
	size_t			 i, j;
	void			*p = NULL;
	int			 ret;

	if (vm == NULL)
		return (1);

	vcp = &vm->vm_params.vmc_params;

	/*
	 * Initialize our VM shared memory request using our original
	 * creation parameters. We'll overwrite the va's after mmap(2).
	 */
	memset(&vsp, 0, sizeof(vsp));
	vsp.vsp_nmemranges = vcp->vcp_nmemranges;
	vsp.vsp_vm_id = vcp->vcp_id;
	memcpy(&vsp.vsp_memranges, &vcp->vcp_memranges,
	    sizeof(vsp.vsp_memranges));

	/*
	 * Use mmap(2) to identify virtual address space for our mappings.
	 */
	for (i = 0; i < VMM_MAX_MEM_RANGES; i++) {
		if (i < vsp.vsp_nmemranges) {
			vmr = &vsp.vsp_memranges[i];

			/* Ignore any MMIO ranges. */
			if (vmr->vmr_type == VM_MEM_MMIO) {
				vmr->vmr_va = 0;
				vcp->vcp_memranges[i].vmr_va = 0;
				continue;
			}

			/* Make initial mappings for the memrange. */
			p = mmap(NULL, vmr->vmr_size, PROT_READ, MAP_ANON, -1,
			    0);
			if (p == MAP_FAILED) {
				ret = errno;
				log_warn("%s: mmap", __func__);
				for (j = 0; j < i; j++) {
					vmr = &vcp->vcp_memranges[j];
					munmap((void *)vmr->vmr_va,
					    vmr->vmr_size);
				}
				return (ret);
			}
			vmr->vmr_va = (vaddr_t)p;
			vcp->vcp_memranges[i].vmr_va = vmr->vmr_va;
		}
	}

	/*
	 * munmap(2) now that we have va's and ranges that don't overlap. vmm
	 * will use the va's and sizes to recreate the mappings for us.
	 */
	for (i = 0; i < vsp.vsp_nmemranges; i++) {
		vmr = &vsp.vsp_memranges[i];
		if (vmr->vmr_type == VM_MEM_MMIO)
			continue;
		if (munmap((void *)vmr->vmr_va, vmr->vmr_size) == -1)
			fatal("%s: munmap", __func__);
	}

	/*
	 * Ask vmm to enter the shared mappings for us. They'll point
	 * to the same host physical memory, but will have a randomized
	 * virtual address for the calling process.
	 */
	if (ioctl(vmm_fd, VMM_IOC_SHAREMEM, &vsp) == -1)
		return (errno);

	return (0);
}

void
vcpu_halt(uint32_t vcpu_id)
{
	mutex_lock(&vm_mtx);
	vcpu_hlt[vcpu_id] = 1;
	mutex_unlock(&vm_mtx);
}

void
vcpu_unhalt(uint32_t vcpu_id)
{
	mutex_lock(&vm_mtx);
	vcpu_hlt[vcpu_id] = 0;
	mutex_unlock(&vm_mtx);
}

void
vcpu_signal_run(uint32_t vcpu_id)
{
	int ret;

	mutex_lock(&vcpu_run_mtx[vcpu_id]);
	ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]);
	if (ret)
		fatalx("%s: can't signal (%d)", __func__, ret);
	mutex_unlock(&vcpu_run_mtx[vcpu_id]);
}