1 /* $OpenBSD: vm.c,v 1.81 2023/01/08 19:57:17 dv Exp $ */ 2 3 /* 4 * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 19 #include <sys/param.h> /* PAGE_SIZE, MAXCOMLEN */ 20 #include <sys/types.h> 21 #include <sys/ioctl.h> 22 #include <sys/queue.h> 23 #include <sys/wait.h> 24 #include <sys/uio.h> 25 #include <sys/stat.h> 26 #include <sys/socket.h> 27 #include <sys/time.h> 28 #include <sys/mman.h> 29 #include <sys/resource.h> 30 31 #include <dev/ic/i8253reg.h> 32 #include <dev/isa/isareg.h> 33 #include <dev/pci/pcireg.h> 34 35 #include <machine/psl.h> 36 #include <machine/pte.h> 37 #include <machine/specialreg.h> 38 #include <machine/vmmvar.h> 39 40 #include <net/if.h> 41 42 #include <errno.h> 43 #include <event.h> 44 #include <fcntl.h> 45 #include <imsg.h> 46 #include <limits.h> 47 #include <poll.h> 48 #include <pthread.h> 49 #include <pthread_np.h> 50 #include <stddef.h> 51 #include <stdio.h> 52 #include <stdlib.h> 53 #include <string.h> 54 #include <unistd.h> 55 #include <util.h> 56 57 #include "atomicio.h" 58 #include "fw_cfg.h" 59 #include "i8253.h" 60 #include "i8259.h" 61 #include "loadfile.h" 62 #include "mc146818.h" 63 #include "mmio.h" 64 #include "ns8250.h" 65 #include "pci.h" 66 #include "virtio.h" 67 #include "vmd.h" 68 #include "vmm.h" 69 70 #define MB(x) (x * 1024UL * 1024UL) 71 #define GB(x) (x * 1024UL * 1024UL * 1024UL) 72 73 #define MMIO_NOTYET 0 74 75 io_fn_t ioports_map[MAX_PORTS]; 76 77 int run_vm(int, int[][VM_MAX_BASE_PER_DISK], int *, 78 struct vmop_create_params *, struct vcpu_reg_state *); 79 void vm_dispatch_vmm(int, short, void *); 80 void *event_thread(void *); 81 void *vcpu_run_loop(void *); 82 int vcpu_exit(struct vm_run_params *); 83 int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *); 84 void create_memory_map(struct vm_create_params *); 85 int alloc_guest_mem(struct vm_create_params *); 86 int vmm_create_vm(struct vm_create_params *); 87 void init_emulated_hw(struct vmop_create_params *, int, 88 int[][VM_MAX_BASE_PER_DISK], int *); 89 void restore_emulated_hw(struct vm_create_params *, int, int *, 90 int[][VM_MAX_BASE_PER_DISK],int); 91 void vcpu_exit_inout(struct vm_run_params *); 92 int vcpu_exit_eptviolation(struct vm_run_params *); 93 uint8_t vcpu_exit_pci(struct vm_run_params *); 94 int vcpu_pic_intr(uint32_t, uint32_t, uint8_t); 95 int loadfile_bios(gzFile, off_t, struct vcpu_reg_state *); 96 int send_vm(int, struct vm_create_params *); 97 int dump_send_header(int); 98 int dump_vmr(int , struct vm_mem_range *); 99 int dump_mem(int, struct vm_create_params *); 100 void restore_vmr(int, struct vm_mem_range *); 101 void restore_mem(int, struct vm_create_params *); 102 int restore_vm_params(int, struct vm_create_params *); 103 void pause_vm(struct vm_create_params *); 104 void unpause_vm(struct 
vm_create_params *); 105 106 int translate_gva(struct vm_exit*, uint64_t, uint64_t *, int); 107 108 static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t, 109 size_t); 110 111 int con_fd; 112 struct vmd_vm *current_vm; 113 114 extern struct vmd *env; 115 116 extern char *__progname; 117 118 pthread_mutex_t threadmutex; 119 pthread_cond_t threadcond; 120 121 pthread_cond_t vcpu_run_cond[VMM_MAX_VCPUS_PER_VM]; 122 pthread_mutex_t vcpu_run_mtx[VMM_MAX_VCPUS_PER_VM]; 123 pthread_barrier_t vm_pause_barrier; 124 pthread_cond_t vcpu_unpause_cond[VMM_MAX_VCPUS_PER_VM]; 125 pthread_mutex_t vcpu_unpause_mtx[VMM_MAX_VCPUS_PER_VM]; 126 uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM]; 127 uint8_t vcpu_done[VMM_MAX_VCPUS_PER_VM]; 128 129 /* 130 * Represents a standard register set for an OS to be booted 131 * as a flat 64 bit address space. 132 * 133 * NOT set here are: 134 * RIP 135 * RSP 136 * GDTR BASE 137 * 138 * Specific bootloaders should clone this structure and override 139 * those fields as needed. 140 * 141 * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on 142 * features of the CPU in use. 143 */ 144 static const struct vcpu_reg_state vcpu_init_flat64 = { 145 .vrs_gprs[VCPU_REGS_RFLAGS] = 0x2, 146 .vrs_gprs[VCPU_REGS_RIP] = 0x0, 147 .vrs_gprs[VCPU_REGS_RSP] = 0x0, 148 .vrs_crs[VCPU_REGS_CR0] = CR0_ET | CR0_PE | CR0_PG, 149 .vrs_crs[VCPU_REGS_CR3] = PML4_PAGE, 150 .vrs_crs[VCPU_REGS_CR4] = CR4_PAE | CR4_PSE, 151 .vrs_crs[VCPU_REGS_PDPTE0] = 0ULL, 152 .vrs_crs[VCPU_REGS_PDPTE1] = 0ULL, 153 .vrs_crs[VCPU_REGS_PDPTE2] = 0ULL, 154 .vrs_crs[VCPU_REGS_PDPTE3] = 0ULL, 155 .vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0}, 156 .vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, 157 .vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, 158 .vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, 159 .vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, 160 .vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, 161 .vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0}, 162 .vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0}, 163 .vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0}, 164 .vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0}, 165 .vrs_msrs[VCPU_REGS_EFER] = EFER_LME | EFER_LMA, 166 .vrs_drs[VCPU_REGS_DR0] = 0x0, 167 .vrs_drs[VCPU_REGS_DR1] = 0x0, 168 .vrs_drs[VCPU_REGS_DR2] = 0x0, 169 .vrs_drs[VCPU_REGS_DR3] = 0x0, 170 .vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0, 171 .vrs_drs[VCPU_REGS_DR7] = 0x400, 172 .vrs_msrs[VCPU_REGS_STAR] = 0ULL, 173 .vrs_msrs[VCPU_REGS_LSTAR] = 0ULL, 174 .vrs_msrs[VCPU_REGS_CSTAR] = 0ULL, 175 .vrs_msrs[VCPU_REGS_SFMASK] = 0ULL, 176 .vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL, 177 .vrs_msrs[VCPU_REGS_MISC_ENABLE] = 0ULL, 178 .vrs_crs[VCPU_REGS_XCR0] = XCR0_X87 179 }; 180 181 /* 182 * Represents a standard register set for a BIOS to be booted 183 * as a flat 16 bit address space. 184 */ 185 static const struct vcpu_reg_state vcpu_init_flat16 = { 186 .vrs_gprs[VCPU_REGS_RFLAGS] = 0x2, 187 .vrs_gprs[VCPU_REGS_RIP] = 0xFFF0, 188 .vrs_gprs[VCPU_REGS_RSP] = 0x0, 189 .vrs_crs[VCPU_REGS_CR0] = 0x60000010, 190 .vrs_crs[VCPU_REGS_CR3] = 0, 191 .vrs_sregs[VCPU_REGS_CS] = { 0xF000, 0xFFFF, 0x809F, 0xF0000}, 192 .vrs_sregs[VCPU_REGS_DS] = { 0x0, 0xFFFF, 0x8093, 0x0}, 193 .vrs_sregs[VCPU_REGS_ES] = { 0x0, 0xFFFF, 0x8093, 0x0}, 194 .vrs_sregs[VCPU_REGS_FS] = { 0x0, 0xFFFF, 0x8093, 0x0}, 195 .vrs_sregs[VCPU_REGS_GS] = { 0x0, 0xFFFF, 0x8093, 0x0}, 196 .vrs_sregs[VCPU_REGS_SS] = { 0x0, 0xFFFF, 0x8093, 0x0}, 197 .vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0}, 198 .vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0}, 199 .vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0}, 200 .vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0}, 201 .vrs_msrs[VCPU_REGS_EFER] = 0ULL, 202 .vrs_drs[VCPU_REGS_DR0] = 0x0, 203 .vrs_drs[VCPU_REGS_DR1] = 0x0, 204 .vrs_drs[VCPU_REGS_DR2] = 0x0, 205 .vrs_drs[VCPU_REGS_DR3] = 0x0, 206 .vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0, 207 .vrs_drs[VCPU_REGS_DR7] = 0x400, 208 .vrs_msrs[VCPU_REGS_STAR] = 0ULL, 209 .vrs_msrs[VCPU_REGS_LSTAR] = 0ULL, 210 .vrs_msrs[VCPU_REGS_CSTAR] = 0ULL, 211 .vrs_msrs[VCPU_REGS_SFMASK] = 0ULL, 212 .vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL, 213 .vrs_crs[VCPU_REGS_XCR0] = XCR0_X87 214 }; 215
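/*
 * Illustrative sketch, assuming a hypothetical loader with its own entry
 * point and stack: the two templates above are meant to be copied and then
 * patched by whatever loads the guest image, which is how start_vm() and
 * loadfile_elf() handle the 64 bit case below.  Roughly:
 *
 *	struct vcpu_reg_state vrs;
 *
 *	memcpy(&vrs, &vcpu_init_flat64, sizeof(vrs));
 *	vrs.vrs_gprs[VCPU_REGS_RIP] = entry_point;	(hypothetical entry)
 *	vrs.vrs_gprs[VCPU_REGS_RSP] = initial_stack;	(hypothetical stack)
 *	...plus the GDTR base/limit for the loader-built GDT...
 */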
216 /* 217 * loadfile_bios 218 * 219 * As an alternative to loadfile_elf, this function loads a non-ELF BIOS image 220 * directly into memory. 221 * 222 * Parameters: 223 * fp: the file containing the BIOS image to load 224 * size: uncompressed size of the image 225 * (out) vrs: register state to set on init for this kernel 226 * 227 * Return values: 228 * 0 if successful 229 * various error codes returned from read(2) or loadelf functions 230 */ 231 int 232 loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_state *vrs) 233 { 234 off_t off; 235 236 /* Set up a "flat 16 bit" register state for BIOS */ 237 memcpy(vrs, &vcpu_init_flat16, sizeof(*vrs)); 238 239 /* Seek to the beginning of the BIOS image */ 240 if (gzseek(fp, 0, SEEK_SET) == -1) 241 return (-1); 242 243 /* The BIOS image must end at 1MB */ 244 if ((off = MB(1) - size) < 0) 245 return (-1); 246 247 /* Read BIOS image into memory */ 248 if (mread(fp, off, size) != (size_t)size) { 249 errno = EIO; 250 return (-1); 251 } 252 253 if (gzseek(fp, 0, SEEK_SET) == -1) 254 return (-1); 255 256 /* Read a second BIOS copy into memory ending at 4GB */ 257 off = GB(4) - size; 258 if (mread(fp, off, size) != (size_t)size) { 259 errno = EIO; 260 return (-1); 261 } 262 263 log_debug("%s: loaded BIOS image", __func__); 264 265 return (0); 266 } 267 268 /* 269 * start_vm 270 * 271 * After forking a new VM process, starts the new VM with the creation 272 * parameters supplied (in the incoming vm->vm_params field). This 273 * function performs a basic sanity check on the incoming parameters 274 * and then performs the following steps to complete the creation of the VM: 275 * 276 * 1. validates and creates the new VM 277 * 2. opens the imsg control channel to the parent and drops more privilege 278 * 3. drops additional privileges by calling pledge(2) 279 * 4. loads the kernel from the disk image or file descriptor 280 * 5. runs the VM's VCPU loops. 281 * 282 * Parameters: 283 * vm: The VM data structure containing the VM create parameters. 284 * fd: The imsg socket that is connected to the parent process. 
285 * 286 * Return values: 287 * 0: success 288 * !0 : failure - typically an errno indicating the source of the failure 289 */ 290 int 291 start_vm(struct vmd_vm *vm, int fd) 292 { 293 struct vmop_create_params *vmc = &vm->vm_params; 294 struct vm_create_params *vcp = &vmc->vmc_params; 295 struct vcpu_reg_state vrs; 296 int nicfds[VMM_MAX_NICS_PER_VM]; 297 int ret; 298 gzFile fp; 299 size_t i; 300 struct vm_rwregs_params vrp; 301 struct stat sb; 302 303 /* Child */ 304 setproctitle("%s", vcp->vcp_name); 305 log_procinit(vcp->vcp_name); 306 307 if (!(vm->vm_state & VM_STATE_RECEIVED)) 308 create_memory_map(vcp); 309 310 ret = alloc_guest_mem(vcp); 311 312 if (ret) { 313 struct rlimit lim; 314 char buf[FMT_SCALED_STRSIZE]; 315 if (ret == ENOMEM && getrlimit(RLIMIT_DATA, &lim) == 0) { 316 if (fmt_scaled(lim.rlim_cur, buf) == 0) 317 fatalx("could not allocate guest memory (data " 318 "limit is %s)", buf); 319 } 320 errno = ret; 321 fatal("could not allocate guest memory"); 322 } 323 324 ret = vmm_create_vm(vcp); 325 current_vm = vm; 326 327 /* send back the kernel-generated vm id (0 on error) */ 328 if (atomicio(vwrite, fd, &vcp->vcp_id, sizeof(vcp->vcp_id)) != 329 sizeof(vcp->vcp_id)) 330 fatal("failed to send created vm id to vmm process"); 331 332 if (ret) { 333 errno = ret; 334 fatal("create vmm ioctl failed - exiting"); 335 } 336 337 /* 338 * pledge in the vm processes: 339 * stdio - for malloc and basic I/O including events. 340 * recvfd - for send/recv. 341 * vmm - for the vmm ioctls and operations. 342 */ 343 if (pledge("stdio vmm recvfd", NULL) == -1) 344 fatal("pledge"); 345 346 if (vm->vm_state & VM_STATE_RECEIVED) { 347 ret = atomicio(read, vm->vm_receive_fd, &vrp, sizeof(vrp)); 348 if (ret != sizeof(vrp)) 349 fatal("received incomplete vrp - exiting"); 350 vrs = vrp.vrwp_regs; 351 } else { 352 /* 353 * Set up default "flat 64 bit" register state - RIP, 354 * RSP, and GDT info will be set in bootloader 355 */ 356 memcpy(&vrs, &vcpu_init_flat64, sizeof(vrs)); 357 358 /* Find and open kernel image */ 359 if ((fp = gzdopen(vm->vm_kernel, "r")) == NULL) 360 fatalx("failed to open kernel - exiting"); 361 362 /* Load kernel image */ 363 ret = loadfile_elf(fp, vcp, &vrs, vmc->vmc_bootdevice); 364 365 /* 366 * Try BIOS as a fallback (only if it was provided as an image 367 * with vm->vm_kernel and the file is not compressed) 368 */ 369 if (ret && errno == ENOEXEC && vm->vm_kernel != -1 && 370 gzdirect(fp) && (ret = fstat(vm->vm_kernel, &sb)) == 0) 371 ret = loadfile_bios(fp, sb.st_size, &vrs); 372 373 if (ret) 374 fatal("failed to load kernel or BIOS - exiting"); 375 376 gzclose(fp); 377 } 378 379 if (vm->vm_kernel != -1) 380 close(vm->vm_kernel); 381 382 con_fd = vm->vm_tty; 383 if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1) 384 fatal("failed to set nonblocking mode on console"); 385 386 for (i = 0; i < VMM_MAX_NICS_PER_VM; i++) 387 nicfds[i] = vm->vm_ifs[i].vif_fd; 388 389 event_init(); 390 391 if (vm->vm_state & VM_STATE_RECEIVED) { 392 restore_emulated_hw(vcp, vm->vm_receive_fd, nicfds, 393 vm->vm_disks, vm->vm_cdrom); 394 restore_mem(vm->vm_receive_fd, vcp); 395 if (restore_vm_params(vm->vm_receive_fd, vcp)) 396 fatal("restore vm params failed"); 397 unpause_vm(vcp); 398 } 399 400 if (vmm_pipe(vm, fd, vm_dispatch_vmm) == -1) 401 fatal("setup vm pipe"); 402 403 /* Execute the vcpu run loop(s) for this VM */ 404 ret = run_vm(vm->vm_cdrom, vm->vm_disks, nicfds, &vm->vm_params, &vrs); 405 406 /* Ensure that any in-flight data is written back */ 407 virtio_shutdown(vm); 408 409 return 
(ret); 410 } 411 412 /* 413 * vm_dispatch_vmm 414 * 415 * imsg callback for messages that are received from the vmm parent process. 416 */ 417 void 418 vm_dispatch_vmm(int fd, short event, void *arg) 419 { 420 struct vmd_vm *vm = arg; 421 struct vmop_result vmr; 422 struct vmop_addr_result var; 423 struct imsgev *iev = &vm->vm_iev; 424 struct imsgbuf *ibuf = &iev->ibuf; 425 struct imsg imsg; 426 ssize_t n; 427 int verbose; 428 429 if (event & EV_READ) { 430 if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN) 431 fatal("%s: imsg_read", __func__); 432 if (n == 0) 433 _exit(0); 434 } 435 436 if (event & EV_WRITE) { 437 if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN) 438 fatal("%s: msgbuf_write fd %d", __func__, ibuf->fd); 439 if (n == 0) 440 _exit(0); 441 } 442 443 for (;;) { 444 if ((n = imsg_get(ibuf, &imsg)) == -1) 445 fatal("%s: imsg_get", __func__); 446 if (n == 0) 447 break; 448 449 #if DEBUG > 1 450 log_debug("%s: got imsg %d from %s", 451 __func__, imsg.hdr.type, 452 vm->vm_params.vmc_params.vcp_name); 453 #endif 454 455 switch (imsg.hdr.type) { 456 case IMSG_CTL_VERBOSE: 457 IMSG_SIZE_CHECK(&imsg, &verbose); 458 memcpy(&verbose, imsg.data, sizeof(verbose)); 459 log_setverbose(verbose); 460 break; 461 case IMSG_VMDOP_VM_SHUTDOWN: 462 if (vmmci_ctl(VMMCI_SHUTDOWN) == -1) 463 _exit(0); 464 break; 465 case IMSG_VMDOP_VM_REBOOT: 466 if (vmmci_ctl(VMMCI_REBOOT) == -1) 467 _exit(0); 468 break; 469 case IMSG_VMDOP_PAUSE_VM: 470 vmr.vmr_result = 0; 471 vmr.vmr_id = vm->vm_vmid; 472 pause_vm(&vm->vm_params.vmc_params); 473 imsg_compose_event(&vm->vm_iev, 474 IMSG_VMDOP_PAUSE_VM_RESPONSE, 475 imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr, 476 sizeof(vmr)); 477 break; 478 case IMSG_VMDOP_UNPAUSE_VM: 479 vmr.vmr_result = 0; 480 vmr.vmr_id = vm->vm_vmid; 481 unpause_vm(&vm->vm_params.vmc_params); 482 imsg_compose_event(&vm->vm_iev, 483 IMSG_VMDOP_UNPAUSE_VM_RESPONSE, 484 imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr, 485 sizeof(vmr)); 486 break; 487 case IMSG_VMDOP_SEND_VM_REQUEST: 488 vmr.vmr_id = vm->vm_vmid; 489 vmr.vmr_result = send_vm(imsg.fd, 490 &vm->vm_params.vmc_params); 491 imsg_compose_event(&vm->vm_iev, 492 IMSG_VMDOP_SEND_VM_RESPONSE, 493 imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr, 494 sizeof(vmr)); 495 if (!vmr.vmr_result) { 496 imsg_flush(&current_vm->vm_iev.ibuf); 497 _exit(0); 498 } 499 break; 500 case IMSG_VMDOP_PRIV_GET_ADDR_RESPONSE: 501 IMSG_SIZE_CHECK(&imsg, &var); 502 memcpy(&var, imsg.data, sizeof(var)); 503 504 log_debug("%s: received tap addr %s for nic %d", 505 vm->vm_params.vmc_params.vcp_name, 506 ether_ntoa((void *)var.var_addr), var.var_nic_idx); 507 508 vionet_set_hostmac(vm, var.var_nic_idx, var.var_addr); 509 break; 510 default: 511 fatalx("%s: got invalid imsg %d from %s", 512 __func__, imsg.hdr.type, 513 vm->vm_params.vmc_params.vcp_name); 514 } 515 imsg_free(&imsg); 516 } 517 imsg_event_add(iev); 518 } 519
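/*
 * Illustrative sketch of the pattern used by the cases above (no new
 * message types are assumed): a control request is handled by doing the
 * work, then answering the parent with a struct vmop_result on the same
 * imsg channel, e.g. for the pause case:
 *
 *	vmr.vmr_result = 0;
 *	vmr.vmr_id = vm->vm_vmid;
 *	pause_vm(&vm->vm_params.vmc_params);
 *	imsg_compose_event(&vm->vm_iev, IMSG_VMDOP_PAUSE_VM_RESPONSE,
 *	    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr, sizeof(vmr));
 */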
520 /* 521 * vm_shutdown 522 * 523 * Tell the vmm parent process to shutdown or reboot the VM and exit. 524 */ 525 __dead void 526 vm_shutdown(unsigned int cmd) 527 { 528 switch (cmd) { 529 case VMMCI_NONE: 530 case VMMCI_SHUTDOWN: 531 (void)imsg_compose_event(&current_vm->vm_iev, 532 IMSG_VMDOP_VM_SHUTDOWN, 0, 0, -1, NULL, 0); 533 break; 534 case VMMCI_REBOOT: 535 (void)imsg_compose_event(&current_vm->vm_iev, 536 IMSG_VMDOP_VM_REBOOT, 0, 0, -1, NULL, 0); 537 break; 538 default: 539 fatalx("invalid vm ctl command: %d", cmd); 540 } 541 imsg_flush(&current_vm->vm_iev.ibuf); 542 543 _exit(0); 544 } 545 546 int 547 send_vm(int fd, struct vm_create_params *vcp) 548 { 549 struct vm_rwregs_params vrp; 550 struct vm_rwvmparams_params vpp; 551 struct vmop_create_params *vmc; 552 struct vm_terminate_params vtp; 553 unsigned int flags = 0; 554 unsigned int i; 555 int ret = 0; 556 size_t sz; 557 558 if (dump_send_header(fd)) { 559 log_info("%s: failed to send vm dump header", __func__); 560 goto err; 561 } 562 563 pause_vm(vcp); 564 565 vmc = calloc(1, sizeof(struct vmop_create_params)); 566 if (vmc == NULL) { 567 log_warn("%s: calloc error getting vmc", __func__); 568 ret = -1; 569 goto err; 570 } 571 572 flags |= VMOP_CREATE_MEMORY; 573 memcpy(&vmc->vmc_params, &current_vm->vm_params, sizeof(struct 574 vmop_create_params)); 575 vmc->vmc_flags = flags; 576 vrp.vrwp_vm_id = vcp->vcp_id; 577 vrp.vrwp_mask = VM_RWREGS_ALL; 578 vpp.vpp_mask = VM_RWVMPARAMS_ALL; 579 vpp.vpp_vm_id = vcp->vcp_id; 580 581 sz = atomicio(vwrite, fd, vmc,sizeof(struct vmop_create_params)); 582 if (sz != sizeof(struct vmop_create_params)) { 583 ret = -1; 584 goto err; 585 } 586 587 for (i = 0; i < vcp->vcp_ncpus; i++) { 588 vrp.vrwp_vcpu_id = i; 589 if ((ret = ioctl(env->vmd_fd, VMM_IOC_READREGS, &vrp))) { 590 log_warn("%s: readregs failed", __func__); 591 goto err; 592 } 593 594 sz = atomicio(vwrite, fd, &vrp, 595 sizeof(struct vm_rwregs_params)); 596 if (sz != sizeof(struct vm_rwregs_params)) { 597 log_warn("%s: dumping registers failed", __func__); 598 ret = -1; 599 goto err; 600 } 601 } 602 603 if ((ret = i8253_dump(fd))) 604 goto err; 605 if ((ret = i8259_dump(fd))) 606 goto err; 607 if ((ret = ns8250_dump(fd))) 608 goto err; 609 if ((ret = mc146818_dump(fd))) 610 goto err; 611 if ((ret = fw_cfg_dump(fd))) 612 goto err; 613 if ((ret = pci_dump(fd))) 614 goto err; 615 if ((ret = virtio_dump(fd))) 616 goto err; 617 if ((ret = dump_mem(fd, vcp))) 618 goto err; 619 620 for (i = 0; i < vcp->vcp_ncpus; i++) { 621 vpp.vpp_vcpu_id = i; 622 if ((ret = ioctl(env->vmd_fd, VMM_IOC_READVMPARAMS, &vpp))) { 623 log_warn("%s: readvmparams failed", __func__); 624 goto err; 625 } 626 627 sz = atomicio(vwrite, fd, &vpp, 628 sizeof(struct vm_rwvmparams_params)); 629 if (sz != sizeof(struct vm_rwvmparams_params)) { 630 log_warn("%s: dumping vm params failed", __func__); 631 ret = -1; 632 goto err; 633 } 634 } 635 636 vtp.vtp_vm_id = vcp->vcp_id; 637 if (ioctl(env->vmd_fd, VMM_IOC_TERM, &vtp) == -1) { 638 log_warnx("%s: term IOC error: %d, %d", __func__, 639 errno, ENOENT); 640 } 641 err: 642 close(fd); 643 if (ret) 644 unpause_vm(vcp); 645 return ret; 646 } 647 648 int 649 dump_send_header(int fd) { 650 struct vm_dump_header vmh; 651 int i; 652 653 memcpy(&vmh.vmh_signature, VM_DUMP_SIGNATURE, 654 sizeof(vmh.vmh_signature)); 655 656 vmh.vmh_cpuids[0].code = 0x00; 657 vmh.vmh_cpuids[0].leaf = 0x00; 658 659 vmh.vmh_cpuids[1].code = 0x01; 660 vmh.vmh_cpuids[1].leaf = 0x00; 661 662 vmh.vmh_cpuids[2].code = 0x07; 663 vmh.vmh_cpuids[2].leaf = 0x00; 664 665 vmh.vmh_cpuids[3].code = 0x0d; 666 vmh.vmh_cpuids[3].leaf = 0x00; 667 668 vmh.vmh_cpuids[4].code = 0x80000001; 669 
vmh.vmh_cpuids[4].leaf = 0x00; 670 671 vmh.vmh_version = VM_DUMP_VERSION; 672 673 for (i=0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) { 674 CPUID_LEAF(vmh.vmh_cpuids[i].code, 675 vmh.vmh_cpuids[i].leaf, 676 vmh.vmh_cpuids[i].a, 677 vmh.vmh_cpuids[i].b, 678 vmh.vmh_cpuids[i].c, 679 vmh.vmh_cpuids[i].d); 680 } 681 682 if (atomicio(vwrite, fd, &vmh, sizeof(vmh)) != sizeof(vmh)) 683 return (-1); 684 685 return (0); 686 } 687 688 int 689 dump_mem(int fd, struct vm_create_params *vcp) 690 { 691 unsigned int i; 692 int ret; 693 struct vm_mem_range *vmr; 694 695 for (i = 0; i < vcp->vcp_nmemranges; i++) { 696 vmr = &vcp->vcp_memranges[i]; 697 ret = dump_vmr(fd, vmr); 698 if (ret) 699 return ret; 700 } 701 return (0); 702 } 703 704 int 705 restore_vm_params(int fd, struct vm_create_params *vcp) { 706 unsigned int i; 707 struct vm_rwvmparams_params vpp; 708 709 for (i = 0; i < vcp->vcp_ncpus; i++) { 710 if (atomicio(read, fd, &vpp, sizeof(vpp)) != sizeof(vpp)) { 711 log_warn("%s: error restoring vm params", __func__); 712 return (-1); 713 } 714 vpp.vpp_vm_id = vcp->vcp_id; 715 vpp.vpp_vcpu_id = i; 716 if (ioctl(env->vmd_fd, VMM_IOC_WRITEVMPARAMS, &vpp) < 0) { 717 log_debug("%s: writing vm params failed", __func__); 718 return (-1); 719 } 720 } 721 return (0); 722 } 723 724 void 725 restore_mem(int fd, struct vm_create_params *vcp) 726 { 727 unsigned int i; 728 struct vm_mem_range *vmr; 729 730 for (i = 0; i < vcp->vcp_nmemranges; i++) { 731 vmr = &vcp->vcp_memranges[i]; 732 restore_vmr(fd, vmr); 733 } 734 } 735 736 int 737 dump_vmr(int fd, struct vm_mem_range *vmr) 738 { 739 size_t rem = vmr->vmr_size, read=0; 740 char buf[PAGE_SIZE]; 741 742 while (rem > 0) { 743 if (read_mem(vmr->vmr_gpa + read, buf, PAGE_SIZE)) { 744 log_warn("failed to read vmr"); 745 return (-1); 746 } 747 if (atomicio(vwrite, fd, buf, sizeof(buf)) != sizeof(buf)) { 748 log_warn("failed to dump vmr"); 749 return (-1); 750 } 751 rem = rem - PAGE_SIZE; 752 read = read + PAGE_SIZE; 753 } 754 return (0); 755 } 756 757 void 758 restore_vmr(int fd, struct vm_mem_range *vmr) 759 { 760 size_t rem = vmr->vmr_size, wrote=0; 761 char buf[PAGE_SIZE]; 762 763 while (rem > 0) { 764 if (atomicio(read, fd, buf, sizeof(buf)) != sizeof(buf)) 765 fatal("failed to restore vmr"); 766 if (write_mem(vmr->vmr_gpa + wrote, buf, PAGE_SIZE)) 767 fatal("failed to write vmr"); 768 rem = rem - PAGE_SIZE; 769 wrote = wrote + PAGE_SIZE; 770 } 771 } 772 773 void 774 pause_vm(struct vm_create_params *vcp) 775 { 776 unsigned int n; 777 int ret; 778 if (current_vm->vm_state & VM_STATE_PAUSED) 779 return; 780 781 current_vm->vm_state |= VM_STATE_PAUSED; 782 783 ret = pthread_barrier_init(&vm_pause_barrier, NULL, vcp->vcp_ncpus + 1); 784 if (ret) { 785 log_warnx("%s: cannot initialize pause barrier (%d)", 786 __progname, ret); 787 return; 788 } 789 790 for (n = 0; n < vcp->vcp_ncpus; n++) { 791 ret = pthread_cond_broadcast(&vcpu_run_cond[n]); 792 if (ret) { 793 log_warnx("%s: can't broadcast vcpu run cond (%d)", 794 __func__, (int)ret); 795 return; 796 } 797 } 798 ret = pthread_barrier_wait(&vm_pause_barrier); 799 if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) { 800 log_warnx("%s: could not wait on pause barrier (%d)", 801 __func__, (int)ret); 802 return; 803 } 804 805 ret = pthread_barrier_destroy(&vm_pause_barrier); 806 if (ret) { 807 log_warnx("%s: could not destroy pause barrier (%d)", 808 __progname, ret); 809 return; 810 } 811 812 i8253_stop(); 813 mc146818_stop(); 814 ns8250_stop(); 815 virtio_stop(vcp); 816 } 817 818 void 819 unpause_vm(struct 
vm_create_params *vcp) 820 { 821 unsigned int n; 822 int ret; 823 if (!(current_vm->vm_state & VM_STATE_PAUSED)) 824 return; 825 826 current_vm->vm_state &= ~VM_STATE_PAUSED; 827 for (n = 0; n < vcp->vcp_ncpus; n++) { 828 ret = pthread_cond_broadcast(&vcpu_unpause_cond[n]); 829 if (ret) { 830 log_warnx("%s: can't broadcast vcpu unpause cond (%d)", 831 __func__, (int)ret); 832 return; 833 } 834 } 835 836 i8253_start(); 837 mc146818_start(); 838 ns8250_start(); 839 virtio_start(vcp); 840 } 841 842 /* 843 * vcpu_reset 844 * 845 * Requests vmm(4) to reset the VCPUs in the indicated VM to 846 * the register state provided 847 * 848 * Parameters 849 * vmid: VM ID to reset 850 * vcpu_id: VCPU ID to reset 851 * vrs: the register state to initialize 852 * 853 * Return values: 854 * 0: success 855 * !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM ID is not 856 * valid) 857 */ 858 int 859 vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs) 860 { 861 struct vm_resetcpu_params vrp; 862 863 memset(&vrp, 0, sizeof(vrp)); 864 vrp.vrp_vm_id = vmid; 865 vrp.vrp_vcpu_id = vcpu_id; 866 memcpy(&vrp.vrp_init_state, vrs, sizeof(struct vcpu_reg_state)); 867 868 log_debug("%s: resetting vcpu %d for vm %d", __func__, vcpu_id, vmid); 869 870 if (ioctl(env->vmd_fd, VMM_IOC_RESETCPU, &vrp) == -1) 871 return (errno); 872 873 return (0); 874 } 875 876 /* 877 * create_memory_map 878 * 879 * Sets up the guest physical memory ranges that the VM can access. 880 * 881 * Parameters: 882 * vcp: VM create parameters describing the VM whose memory map 883 * is being created 884 * 885 * Return values: 886 * nothing 887 */ 888 void 889 create_memory_map(struct vm_create_params *vcp) 890 { 891 size_t len, mem_bytes; 892 size_t above_1m = 0, above_4g = 0; 893 894 mem_bytes = vcp->vcp_memranges[0].vmr_size; 895 vcp->vcp_nmemranges = 0; 896 if (mem_bytes == 0 || mem_bytes > VMM_MAX_VM_MEM_SIZE) 897 return; 898 899 /* First memory region: 0 - LOWMEM_KB (DOS low mem) */ 900 len = LOWMEM_KB * 1024; 901 vcp->vcp_memranges[0].vmr_gpa = 0x0; 902 vcp->vcp_memranges[0].vmr_size = len; 903 vcp->vcp_memranges[0].vmr_type = VM_MEM_RAM; 904 mem_bytes -= len; 905 906 /* 907 * Second memory region: LOWMEM_KB - 1MB. 908 * 909 * N.B. - Normally ROMs or parts of video RAM are mapped here. 910 * We have to add this region, because some systems 911 * unconditionally write to 0xb8000 (VGA RAM), and 912 * we need to make sure that vmm(4) permits accesses 913 * to it. So allocate guest memory for it. 914 */ 915 len = MB(1) - (LOWMEM_KB * 1024); 916 vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024; 917 vcp->vcp_memranges[1].vmr_size = len; 918 vcp->vcp_memranges[1].vmr_type = VM_MEM_RESERVED; 919 mem_bytes -= len; 920 921 /* If we have less than 2MB remaining, still create a 2nd BIOS area. */ 922 if (mem_bytes <= MB(2)) { 923 vcp->vcp_memranges[2].vmr_gpa = VMM_PCI_MMIO_BAR_END; 924 vcp->vcp_memranges[2].vmr_size = MB(2); 925 vcp->vcp_memranges[2].vmr_type = VM_MEM_RESERVED; 926 vcp->vcp_nmemranges = 3; 927 return; 928 } 929 930 /* 931 * Calculate the how to split any remaining memory across the 4GB 932 * boundary while making sure we do not place physical memory into 933 * MMIO ranges. 
934 */ 935 if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - MB(1)) { 936 above_1m = VMM_PCI_MMIO_BAR_BASE - MB(1); 937 above_4g = mem_bytes - above_1m; 938 } else { 939 above_1m = mem_bytes; 940 above_4g = 0; 941 } 942 943 /* Third memory region: area above 1MB to MMIO region */ 944 vcp->vcp_memranges[2].vmr_gpa = MB(1); 945 vcp->vcp_memranges[2].vmr_size = above_1m; 946 vcp->vcp_memranges[2].vmr_type = VM_MEM_RAM; 947 948 /* Fourth region: PCI MMIO range */ 949 vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_BASE; 950 vcp->vcp_memranges[3].vmr_size = VMM_PCI_MMIO_BAR_END - 951 VMM_PCI_MMIO_BAR_BASE + 1; 952 vcp->vcp_memranges[3].vmr_type = VM_MEM_MMIO; 953 954 /* Fifth region: 2nd copy of BIOS above MMIO ending at 4GB */ 955 vcp->vcp_memranges[4].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1; 956 vcp->vcp_memranges[4].vmr_size = MB(2); 957 vcp->vcp_memranges[4].vmr_type = VM_MEM_RESERVED; 958 959 /* Sixth region: any remainder above 4GB */ 960 if (above_4g > 0) { 961 vcp->vcp_memranges[5].vmr_gpa = GB(4); 962 vcp->vcp_memranges[5].vmr_size = above_4g; 963 vcp->vcp_memranges[5].vmr_type = VM_MEM_RAM; 964 vcp->vcp_nmemranges = 6; 965 } else 966 vcp->vcp_nmemranges = 5; 967 } 968 969 /* 970 * alloc_guest_mem 971 * 972 * Allocates memory for the guest. 973 * Instead of doing a single allocation with one mmap(), we allocate memory 974 * separately for every range for the following reasons: 975 * - ASLR for the individual ranges 976 * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to 977 * map the single mmap'd userspace memory to the individual guest physical 978 * memory ranges, the underlying amap of the single mmap'd range would have 979 * to allocate per-page reference counters. The reason is that the 980 * individual guest physical ranges would reference the single mmap'd region 981 * only partially. However, if every guest physical range has its own 982 * corresponding mmap'd userspace allocation, there are no partial 983 * references: every guest physical range fully references an mmap'd 984 * range => no per-page reference counters have to be allocated. 985 * 986 * Return values: 987 * 0: success 988 * !0: failure - errno indicating the source of the failure 989 */ 990 int 991 alloc_guest_mem(struct vm_create_params *vcp) 992 { 993 void *p; 994 int ret; 995 size_t i, j; 996 struct vm_mem_range *vmr; 997 998 for (i = 0; i < vcp->vcp_nmemranges; i++) { 999 vmr = &vcp->vcp_memranges[i]; 1000 p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE, 1001 MAP_PRIVATE | MAP_ANON, -1, 0); 1002 if (p == MAP_FAILED) { 1003 ret = errno; 1004 for (j = 0; j < i; j++) { 1005 vmr = &vcp->vcp_memranges[j]; 1006 munmap((void *)vmr->vmr_va, vmr->vmr_size); 1007 } 1008 1009 return (ret); 1010 } 1011 1012 vmr->vmr_va = (vaddr_t)p; 1013 } 1014 1015 return (0); 1016 } 1017 1018 /* 1019 * vmm_create_vm 1020 * 1021 * Requests vmm(4) to create a new VM using the supplied creation 1022 * parameters. This operation results in the creation of the in-kernel 1023 * structures for the VM, but does not start the VM's vcpu(s). 
1024 * 1025 * Parameters: 1026 * vcp: vm_create_params struct containing the VM's desired creation 1027 * configuration 1028 * 1029 * Return values: 1030 * 0: success 1031 * !0 : ioctl to vmm(4) failed 1032 */ 1033 int 1034 vmm_create_vm(struct vm_create_params *vcp) 1035 { 1036 /* Sanity check arguments */ 1037 if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM) 1038 return (EINVAL); 1039 1040 if (vcp->vcp_nmemranges == 0 || 1041 vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES) 1042 return (EINVAL); 1043 1044 if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM) 1045 return (EINVAL); 1046 1047 if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM) 1048 return (EINVAL); 1049 1050 if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) == -1) 1051 return (errno); 1052 1053 return (0); 1054 } 1055 1056 /* 1057 * init_emulated_hw 1058 * 1059 * Initializes the userspace hardware emulation 1060 */ 1061 void 1062 init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom, 1063 int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps) 1064 { 1065 struct vm_create_params *vcp = &vmc->vmc_params; 1066 size_t i; 1067 uint64_t memlo, memhi; 1068 1069 /* Calculate memory size for NVRAM registers */ 1070 memlo = memhi = 0; 1071 for (i = 0; i < vcp->vcp_nmemranges; i++) { 1072 if (vcp->vcp_memranges[i].vmr_gpa == MB(1) && 1073 vcp->vcp_memranges[i].vmr_size > (15 * MB(1))) 1074 memlo = vcp->vcp_memranges[i].vmr_size - (15 * MB(1)); 1075 else if (vcp->vcp_memranges[i].vmr_gpa == GB(4)) 1076 memhi = vcp->vcp_memranges[i].vmr_size; 1077 } 1078 1079 /* Reset the IO port map */ 1080 memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS); 1081 1082 /* Init i8253 PIT */ 1083 i8253_init(vcp->vcp_id); 1084 ioports_map[TIMER_CTRL] = vcpu_exit_i8253; 1085 ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253; 1086 ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253; 1087 ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253; 1088 ioports_map[PCKBC_AUX] = vcpu_exit_i8253_misc; 1089 1090 /* Init mc146818 RTC */ 1091 mc146818_init(vcp->vcp_id, memlo, memhi); 1092 ioports_map[IO_RTC] = vcpu_exit_mc146818; 1093 ioports_map[IO_RTC + 1] = vcpu_exit_mc146818; 1094 1095 /* Init master and slave PICs */ 1096 i8259_init(); 1097 ioports_map[IO_ICU1] = vcpu_exit_i8259; 1098 ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259; 1099 ioports_map[IO_ICU2] = vcpu_exit_i8259; 1100 ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259; 1101 ioports_map[ELCR0] = vcpu_exit_elcr; 1102 ioports_map[ELCR1] = vcpu_exit_elcr; 1103 1104 /* Init ns8250 UART */ 1105 ns8250_init(con_fd, vcp->vcp_id); 1106 for (i = COM1_DATA; i <= COM1_SCR; i++) 1107 ioports_map[i] = vcpu_exit_com; 1108 1109 /* Init QEMU fw_cfg interface */ 1110 fw_cfg_init(vmc); 1111 ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg; 1112 ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg; 1113 ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma; 1114 ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma; 1115 1116 /* Initialize PCI */ 1117 for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++) 1118 ioports_map[i] = vcpu_exit_pci; 1119 1120 ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci; 1121 ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci; 1122 ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci; 1123 ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci; 1124 ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci; 1125 pci_init(); 1126 1127 /* Initialize virtio devices */ 1128 virtio_init(current_vm, child_cdrom, child_disks, child_taps); 1129 } 1130 /* 1131 * restore_emulated_hw 1132 * 1133 * Restores the userspace 
hardware emulation from fd 1134 */ 1135 void 1136 restore_emulated_hw(struct vm_create_params *vcp, int fd, 1137 int *child_taps, int child_disks[][VM_MAX_BASE_PER_DISK], int child_cdrom) 1138 { 1139 /* struct vm_create_params *vcp = &vmc->vmc_params; */ 1140 int i; 1141 memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS); 1142 1143 /* Init i8253 PIT */ 1144 i8253_restore(fd, vcp->vcp_id); 1145 ioports_map[TIMER_CTRL] = vcpu_exit_i8253; 1146 ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253; 1147 ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253; 1148 ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253; 1149 1150 /* Init master and slave PICs */ 1151 i8259_restore(fd); 1152 ioports_map[IO_ICU1] = vcpu_exit_i8259; 1153 ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259; 1154 ioports_map[IO_ICU2] = vcpu_exit_i8259; 1155 ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259; 1156 1157 /* Init ns8250 UART */ 1158 ns8250_restore(fd, con_fd, vcp->vcp_id); 1159 for (i = COM1_DATA; i <= COM1_SCR; i++) 1160 ioports_map[i] = vcpu_exit_com; 1161 1162 /* Init mc146818 RTC */ 1163 mc146818_restore(fd, vcp->vcp_id); 1164 ioports_map[IO_RTC] = vcpu_exit_mc146818; 1165 ioports_map[IO_RTC + 1] = vcpu_exit_mc146818; 1166 1167 /* Init QEMU fw_cfg interface */ 1168 fw_cfg_restore(fd); 1169 ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg; 1170 ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg; 1171 ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma; 1172 ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma; 1173 1174 /* Initialize PCI */ 1175 for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++) 1176 ioports_map[i] = vcpu_exit_pci; 1177 1178 ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci; 1179 ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci; 1180 ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci; 1181 ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci; 1182 ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci; 1183 pci_restore(fd); 1184 virtio_restore(fd, current_vm, child_cdrom, child_disks, child_taps); 1185 } 1186 1187 /* 1188 * run_vm 1189 * 1190 * Runs the VM whose creation parameters are specified in vcp 1191 * 1192 * Parameters: 1193 * child_cdrom: previously-opened child ISO disk file descriptor 1194 * child_disks: previously-opened child VM disk file file descriptors 1195 * child_taps: previously-opened child tap file descriptors 1196 * vmc: vmop_create_params struct containing the VM's desired creation 1197 * configuration 1198 * vrs: VCPU register state to initialize 1199 * 1200 * Return values: 1201 * 0: the VM exited normally 1202 * !0 : the VM exited abnormally or failed to start 1203 */ 1204 int 1205 run_vm(int child_cdrom, int child_disks[][VM_MAX_BASE_PER_DISK], 1206 int *child_taps, struct vmop_create_params *vmc, 1207 struct vcpu_reg_state *vrs) 1208 { 1209 struct vm_create_params *vcp = &vmc->vmc_params; 1210 struct vm_rwregs_params vregsp; 1211 uint8_t evdone = 0; 1212 size_t i; 1213 int ret; 1214 pthread_t *tid, evtid; 1215 char tname[MAXCOMLEN + 1]; 1216 struct vm_run_params **vrp; 1217 void *exit_status; 1218 1219 if (vcp == NULL) 1220 return (EINVAL); 1221 1222 if (child_cdrom == -1 && strlen(vcp->vcp_cdrom)) 1223 return (EINVAL); 1224 1225 if (child_disks == NULL && vcp->vcp_ndisks != 0) 1226 return (EINVAL); 1227 1228 if (child_taps == NULL && vcp->vcp_nnics != 0) 1229 return (EINVAL); 1230 1231 if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM) 1232 return (EINVAL); 1233 1234 if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM) 1235 return (EINVAL); 1236 1237 if 
(vcp->vcp_nnics > VMM_MAX_NICS_PER_VM) 1238 return (EINVAL); 1239 1240 if (vcp->vcp_nmemranges == 0 || 1241 vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES) 1242 return (EINVAL); 1243 1244 tid = calloc(vcp->vcp_ncpus, sizeof(pthread_t)); 1245 vrp = calloc(vcp->vcp_ncpus, sizeof(struct vm_run_params *)); 1246 if (tid == NULL || vrp == NULL) { 1247 log_warn("%s: memory allocation error - exiting.", 1248 __progname); 1249 return (ENOMEM); 1250 } 1251 1252 log_debug("%s: initializing hardware for vm %s", __func__, 1253 vcp->vcp_name); 1254 1255 if (!(current_vm->vm_state & VM_STATE_RECEIVED)) 1256 init_emulated_hw(vmc, child_cdrom, child_disks, child_taps); 1257 1258 ret = pthread_mutex_init(&threadmutex, NULL); 1259 if (ret) { 1260 log_warn("%s: could not initialize thread state mutex", 1261 __func__); 1262 return (ret); 1263 } 1264 ret = pthread_cond_init(&threadcond, NULL); 1265 if (ret) { 1266 log_warn("%s: could not initialize thread state " 1267 "condition variable", __func__); 1268 return (ret); 1269 } 1270 1271 mutex_lock(&threadmutex); 1272 1273 log_debug("%s: starting vcpu threads for vm %s", __func__, 1274 vcp->vcp_name); 1275 1276 /* 1277 * Create and launch one thread for each VCPU. These threads may 1278 * migrate between PCPUs over time; the need to reload CPU state 1279 * in such situations is detected and performed by vmm(4) in the 1280 * kernel. 1281 */ 1282 for (i = 0 ; i < vcp->vcp_ncpus; i++) { 1283 vrp[i] = malloc(sizeof(struct vm_run_params)); 1284 if (vrp[i] == NULL) { 1285 log_warn("%s: memory allocation error - " 1286 "exiting.", __progname); 1287 /* caller will exit, so skip freeing */ 1288 return (ENOMEM); 1289 } 1290 vrp[i]->vrp_exit = malloc(sizeof(struct vm_exit)); 1291 if (vrp[i]->vrp_exit == NULL) { 1292 log_warn("%s: memory allocation error - " 1293 "exiting.", __progname); 1294 /* caller will exit, so skip freeing */ 1295 return (ENOMEM); 1296 } 1297 vrp[i]->vrp_vm_id = vcp->vcp_id; 1298 vrp[i]->vrp_vcpu_id = i; 1299 1300 if (vcpu_reset(vcp->vcp_id, i, vrs)) { 1301 log_warnx("%s: cannot reset VCPU %zu - exiting.", 1302 __progname, i); 1303 return (EIO); 1304 } 1305 1306 /* once more because reset_cpu changes regs */ 1307 if (current_vm->vm_state & VM_STATE_RECEIVED) { 1308 vregsp.vrwp_vm_id = vcp->vcp_id; 1309 vregsp.vrwp_vcpu_id = i; 1310 vregsp.vrwp_regs = *vrs; 1311 vregsp.vrwp_mask = VM_RWREGS_ALL; 1312 if ((ret = ioctl(env->vmd_fd, VMM_IOC_WRITEREGS, 1313 &vregsp)) == -1) { 1314 log_warn("%s: writeregs failed", __func__); 1315 return (ret); 1316 } 1317 } 1318 1319 ret = pthread_cond_init(&vcpu_run_cond[i], NULL); 1320 if (ret) { 1321 log_warnx("%s: cannot initialize cond var (%d)", 1322 __progname, ret); 1323 return (ret); 1324 } 1325 1326 ret = pthread_mutex_init(&vcpu_run_mtx[i], NULL); 1327 if (ret) { 1328 log_warnx("%s: cannot initialize mtx (%d)", 1329 __progname, ret); 1330 return (ret); 1331 } 1332 1333 ret = pthread_cond_init(&vcpu_unpause_cond[i], NULL); 1334 if (ret) { 1335 log_warnx("%s: cannot initialize unpause var (%d)", 1336 __progname, ret); 1337 return (ret); 1338 } 1339 1340 ret = pthread_mutex_init(&vcpu_unpause_mtx[i], NULL); 1341 if (ret) { 1342 log_warnx("%s: cannot initialize unpause mtx (%d)", 1343 __progname, ret); 1344 return (ret); 1345 } 1346 1347 vcpu_hlt[i] = 0; 1348 1349 /* Start each VCPU run thread at vcpu_run_loop */ 1350 ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]); 1351 if (ret) { 1352 /* caller will _exit after this return */ 1353 ret = errno; 1354 log_warn("%s: could not create vcpu thread %zu", 1355 __func__, 
i); 1356 return (ret); 1357 } 1358 1359 snprintf(tname, sizeof(tname), "vcpu-%zu", i); 1360 pthread_set_name_np(tid[i], tname); 1361 } 1362 1363 log_debug("%s: waiting on events for VM %s", __func__, vcp->vcp_name); 1364 ret = pthread_create(&evtid, NULL, event_thread, &evdone); 1365 if (ret) { 1366 errno = ret; 1367 log_warn("%s: could not create event thread", __func__); 1368 return (ret); 1369 } 1370 pthread_set_name_np(evtid, "event"); 1371 1372 for (;;) { 1373 ret = pthread_cond_wait(&threadcond, &threadmutex); 1374 if (ret) { 1375 log_warn("%s: waiting on thread state condition " 1376 "variable failed", __func__); 1377 return (ret); 1378 } 1379 1380 /* 1381 * Did a VCPU thread exit with an error? => return the first one 1382 */ 1383 for (i = 0; i < vcp->vcp_ncpus; i++) { 1384 if (vcpu_done[i] == 0) 1385 continue; 1386 1387 if (pthread_join(tid[i], &exit_status)) { 1388 log_warn("%s: failed to join thread %zd - " 1389 "exiting", __progname, i); 1390 return (EIO); 1391 } 1392 1393 ret = (intptr_t)exit_status; 1394 } 1395 1396 /* Did the event thread exit? => return with an error */ 1397 if (evdone) { 1398 if (pthread_join(evtid, &exit_status)) { 1399 log_warn("%s: failed to join event thread - " 1400 "exiting", __progname); 1401 return (EIO); 1402 } 1403 1404 log_warnx("%s: vm %d event thread exited " 1405 "unexpectedly", __progname, vcp->vcp_id); 1406 return (EIO); 1407 } 1408 1409 /* Did all VCPU threads exit successfully? => return */ 1410 for (i = 0; i < vcp->vcp_ncpus; i++) { 1411 if (vcpu_done[i] == 0) 1412 break; 1413 } 1414 if (i == vcp->vcp_ncpus) 1415 return (ret); 1416 1417 /* Some more threads to wait for, start over */ 1418 } 1419 1420 return (ret); 1421 } 1422 1423 void * 1424 event_thread(void *arg) 1425 { 1426 uint8_t *donep = arg; 1427 intptr_t ret; 1428 1429 ret = event_dispatch(); 1430 1431 mutex_lock(&threadmutex); 1432 *donep = 1; 1433 pthread_cond_signal(&threadcond); 1434 mutex_unlock(&threadmutex); 1435 1436 return (void *)ret; 1437 } 1438 1439 /* 1440 * vcpu_run_loop 1441 * 1442 * Runs a single VCPU until vmm(4) requires help handling an exit, 1443 * or the VM terminates. 
1444 * 1445 * Parameters: 1446 * arg: vcpu_run_params for the VCPU being run by this thread 1447 * 1448 * Return values: 1449 * NULL: the VCPU shutdown properly 1450 * !NULL: error processing VCPU run, or the VCPU shutdown abnormally 1451 */ 1452 void * 1453 vcpu_run_loop(void *arg) 1454 { 1455 struct vm_run_params *vrp = (struct vm_run_params *)arg; 1456 intptr_t ret = 0; 1457 int irq; 1458 uint32_t n; 1459 1460 vrp->vrp_continue = 0; 1461 n = vrp->vrp_vcpu_id; 1462 1463 for (;;) { 1464 ret = pthread_mutex_lock(&vcpu_run_mtx[n]); 1465 1466 if (ret) { 1467 log_warnx("%s: can't lock vcpu run mtx (%d)", 1468 __func__, (int)ret); 1469 return ((void *)ret); 1470 } 1471 1472 /* If we are halted and need to pause, pause */ 1473 if (vcpu_hlt[n] && (current_vm->vm_state & VM_STATE_PAUSED)) { 1474 ret = pthread_barrier_wait(&vm_pause_barrier); 1475 if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) { 1476 log_warnx("%s: could not wait on pause barrier (%d)", 1477 __func__, (int)ret); 1478 return ((void *)ret); 1479 } 1480 1481 ret = pthread_mutex_lock(&vcpu_unpause_mtx[n]); 1482 if (ret) { 1483 log_warnx("%s: can't lock vcpu unpause mtx (%d)", 1484 __func__, (int)ret); 1485 return ((void *)ret); 1486 } 1487 1488 ret = pthread_cond_wait(&vcpu_unpause_cond[n], 1489 &vcpu_unpause_mtx[n]); 1490 if (ret) { 1491 log_warnx( 1492 "%s: can't wait on unpause cond (%d)", 1493 __func__, (int)ret); 1494 break; 1495 } 1496 ret = pthread_mutex_unlock(&vcpu_unpause_mtx[n]); 1497 if (ret) { 1498 log_warnx("%s: can't unlock unpause mtx (%d)", 1499 __func__, (int)ret); 1500 break; 1501 } 1502 } 1503 1504 /* If we are halted and not paused, wait */ 1505 if (vcpu_hlt[n]) { 1506 ret = pthread_cond_wait(&vcpu_run_cond[n], 1507 &vcpu_run_mtx[n]); 1508 1509 if (ret) { 1510 log_warnx( 1511 "%s: can't wait on cond (%d)", 1512 __func__, (int)ret); 1513 (void)pthread_mutex_unlock( 1514 &vcpu_run_mtx[n]); 1515 break; 1516 } 1517 } 1518 1519 ret = pthread_mutex_unlock(&vcpu_run_mtx[n]); 1520 1521 if (ret) { 1522 log_warnx("%s: can't unlock mutex on cond (%d)", 1523 __func__, (int)ret); 1524 break; 1525 } 1526 1527 if (vrp->vrp_irqready && i8259_is_pending()) { 1528 irq = i8259_ack(); 1529 vrp->vrp_irq = irq; 1530 } else 1531 vrp->vrp_irq = 0xFFFF; 1532 1533 /* Still more pending? */ 1534 if (i8259_is_pending()) { 1535 /* 1536 * XXX can probably avoid ioctls here by providing intr 1537 * in vrp 1538 */ 1539 if (vcpu_pic_intr(vrp->vrp_vm_id, 1540 vrp->vrp_vcpu_id, 1)) { 1541 fatal("can't set INTR"); 1542 } 1543 } else { 1544 if (vcpu_pic_intr(vrp->vrp_vm_id, 1545 vrp->vrp_vcpu_id, 0)) { 1546 fatal("can't clear INTR"); 1547 } 1548 } 1549 1550 if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) == -1) { 1551 /* If run ioctl failed, exit */ 1552 ret = errno; 1553 log_warn("%s: vm %d / vcpu %d run ioctl failed", 1554 __func__, vrp->vrp_vm_id, n); 1555 break; 1556 } 1557 1558 /* If the VM is terminating, exit normally */ 1559 if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED) { 1560 ret = (intptr_t)NULL; 1561 break; 1562 } 1563 1564 if (vrp->vrp_exit_reason != VM_EXIT_NONE) { 1565 /* 1566 * vmm(4) needs help handling an exit, handle in 1567 * vcpu_exit. 
1568 */ 1569 ret = vcpu_exit(vrp); 1570 if (ret) 1571 break; 1572 } 1573 } 1574 1575 mutex_lock(&threadmutex); 1576 vcpu_done[n] = 1; 1577 pthread_cond_signal(&threadcond); 1578 mutex_unlock(&threadmutex); 1579 1580 return ((void *)ret); 1581 } 1582 1583 int 1584 vcpu_pic_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr) 1585 { 1586 struct vm_intr_params vip; 1587 1588 memset(&vip, 0, sizeof(vip)); 1589 1590 vip.vip_vm_id = vm_id; 1591 vip.vip_vcpu_id = vcpu_id; /* XXX always 0? */ 1592 vip.vip_intr = intr; 1593 1594 if (ioctl(env->vmd_fd, VMM_IOC_INTR, &vip) == -1) 1595 return (errno); 1596 1597 return (0); 1598 } 1599 1600 /* 1601 * vcpu_exit_pci 1602 * 1603 * Handle all I/O to the emulated PCI subsystem. 1604 * 1605 * Parameters: 1606 * vrp: vcpu run parameters containing guest state for this exit 1607 * 1608 * Return value: 1609 * Interrupt to inject to the guest VM, or 0xFF if no interrupt should 1610 * be injected. 1611 */ 1612 uint8_t 1613 vcpu_exit_pci(struct vm_run_params *vrp) 1614 { 1615 struct vm_exit *vei = vrp->vrp_exit; 1616 uint8_t intr; 1617 1618 intr = 0xFF; 1619 1620 switch (vei->vei.vei_port) { 1621 case PCI_MODE1_ADDRESS_REG: 1622 pci_handle_address_reg(vrp); 1623 break; 1624 case PCI_MODE1_DATA_REG: 1625 case PCI_MODE1_DATA_REG + 1: 1626 case PCI_MODE1_DATA_REG + 2: 1627 case PCI_MODE1_DATA_REG + 3: 1628 pci_handle_data_reg(vrp); 1629 break; 1630 case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END: 1631 intr = pci_handle_io(vrp); 1632 break; 1633 default: 1634 log_warnx("%s: unknown PCI register 0x%llx", 1635 __progname, (uint64_t)vei->vei.vei_port); 1636 break; 1637 } 1638 1639 return (intr); 1640 } 1641 1642 /* 1643 * vcpu_exit_inout 1644 * 1645 * Handle all I/O exits that need to be emulated in vmd. This includes the 1646 * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device. 1647 * 1648 * Parameters: 1649 * vrp: vcpu run parameters containing guest state for this exit 1650 */ 1651 void 1652 vcpu_exit_inout(struct vm_run_params *vrp) 1653 { 1654 struct vm_exit *vei = vrp->vrp_exit; 1655 uint8_t intr = 0xFF; 1656 1657 if (vei->vei.vei_rep || vei->vei.vei_string) { 1658 #ifdef MMIO_DEBUG 1659 log_info("%s: %s%s%s %d-byte, enc=%d, data=0x%08x, port=0x%04x", 1660 __func__, 1661 vei->vei.vei_rep == 0 ? "" : "REP ", 1662 vei->vei.vei_dir == VEI_DIR_IN ? "IN" : "OUT", 1663 vei->vei.vei_string == 0 ? "" : "S", 1664 vei->vei.vei_size, vei->vei.vei_encoding, 1665 vei->vei.vei_data, vei->vei.vei_port); 1666 log_info("%s: ECX = 0x%llx, RDX = 0x%llx, RSI = 0x%llx", 1667 __func__, 1668 vei->vrs.vrs_gprs[VCPU_REGS_RCX], 1669 vei->vrs.vrs_gprs[VCPU_REGS_RDX], 1670 vei->vrs.vrs_gprs[VCPU_REGS_RSI]); 1671 #endif /* MMIO_DEBUG */ 1672 fatalx("%s: can't emulate REP prefixed IN(S)/OUT(S)", 1673 __func__); 1674 } 1675 1676 if (ioports_map[vei->vei.vei_port] != NULL) 1677 intr = ioports_map[vei->vei.vei_port](vrp); 1678 else if (vei->vei.vei_dir == VEI_DIR_IN) 1679 set_return_data(vei, 0xFFFFFFFF); 1680 1681 if (intr != 0xFF) 1682 vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr); 1683 } 1684 1685 /* 1686 * vcpu_exit_eptviolation 1687 * 1688 * handle an EPT Violation 1689 * 1690 * Parameters: 1691 * vrp: vcpu run parameters containing guest state for this exit 1692 * 1693 * Return values: 1694 * 0: no action required 1695 * EFAULT: a protection fault occured, kill the vm. 
1696 */ 1697 int 1698 vcpu_exit_eptviolation(struct vm_run_params *vrp) 1699 { 1700 struct vm_exit *ve = vrp->vrp_exit; 1701 int ret = 0; 1702 #if MMIO_NOTYET 1703 struct x86_insn insn; 1704 uint64_t va, pa; 1705 size_t len = 15; /* Max instruction length in x86. */ 1706 #endif /* MMIO_NOTYET */ 1707 switch (ve->vee.vee_fault_type) { 1708 case VEE_FAULT_HANDLED: 1709 log_debug("%s: fault already handled", __func__); 1710 break; 1711 1712 #if MMIO_NOTYET 1713 case VEE_FAULT_MMIO_ASSIST: 1714 /* Intel VMX might give us the length of the instruction. */ 1715 if (ve->vee.vee_insn_info & VEE_LEN_VALID) 1716 len = ve->vee.vee_insn_len; 1717 1718 if (len > 15) 1719 fatalx("%s: invalid instruction length %lu", __func__, 1720 len); 1721 1722 /* If we weren't given instruction bytes, we need to fetch. */ 1723 if (!(ve->vee.vee_insn_info & VEE_BYTES_VALID)) { 1724 memset(ve->vee.vee_insn_bytes, 0, 1725 sizeof(ve->vee.vee_insn_bytes)); 1726 va = ve->vrs.vrs_gprs[VCPU_REGS_RIP]; 1727 1728 /* XXX Only support instructions that fit on 1 page. */ 1729 if ((va & PAGE_MASK) + len > PAGE_SIZE) { 1730 log_warnx("%s: instruction might cross page " 1731 "boundary", __func__); 1732 ret = EINVAL; 1733 break; 1734 } 1735 1736 ret = translate_gva(ve, va, &pa, PROT_EXEC); 1737 if (ret != 0) { 1738 log_warnx("%s: failed gva translation", 1739 __func__); 1740 break; 1741 } 1742 1743 ret = read_mem(pa, ve->vee.vee_insn_bytes, len); 1744 if (ret != 0) { 1745 log_warnx("%s: failed to fetch instruction " 1746 "bytes from 0x%llx", __func__, pa); 1747 break; 1748 } 1749 } 1750 1751 ret = insn_decode(ve, &insn); 1752 if (ret == 0) 1753 ret = insn_emulate(ve, &insn); 1754 break; 1755 #endif /* MMIO_NOTYET */ 1756 1757 case VEE_FAULT_PROTECT: 1758 log_debug("%s: EPT Violation: rip=0x%llx", __progname, 1759 ve->vrs.vrs_gprs[VCPU_REGS_RIP]); 1760 ret = EFAULT; 1761 break; 1762 1763 default: 1764 fatalx("%s: invalid fault_type %d", __progname, 1765 ve->vee.vee_fault_type); 1766 /* UNREACHED */ 1767 } 1768 1769 return (ret); 1770 } 1771 1772 /* 1773 * vcpu_exit 1774 * 1775 * Handle a vcpu exit. This function is called when it is determined that 1776 * vmm(4) requires the assistance of vmd to support a particular guest 1777 * exit type (eg, accessing an I/O port or device). Guest state is contained 1778 * in 'vrp', and will be resent to vmm(4) on exit completion. 1779 * 1780 * Upon conclusion of handling the exit, the function determines if any 1781 * interrupts should be injected into the guest, and asserts the proper 1782 * IRQ line whose interrupt should be vectored. 1783 * 1784 * Parameters: 1785 * vrp: vcpu run parameters containing guest state for this exit 1786 * 1787 * Return values: 1788 * 0: the exit was handled successfully 1789 * 1: an error occurred (eg, unknown exit reason passed in 'vrp') 1790 */ 1791 int 1792 vcpu_exit(struct vm_run_params *vrp) 1793 { 1794 int ret; 1795 1796 switch (vrp->vrp_exit_reason) { 1797 case VMX_EXIT_INT_WINDOW: 1798 case SVM_VMEXIT_VINTR: 1799 case VMX_EXIT_CPUID: 1800 case VMX_EXIT_EXTINT: 1801 case SVM_VMEXIT_INTR: 1802 case SVM_VMEXIT_MSR: 1803 case SVM_VMEXIT_CPUID: 1804 /* 1805 * We may be exiting to vmd to handle a pending interrupt but 1806 * at the same time the last exit type may have been one of 1807 * these. In this case, there's nothing extra to be done 1808 * here (and falling through to the default case below results 1809 * in more vmd log spam). 
1810 */ 1811 break; 1812 case SVM_VMEXIT_NPF: 1813 case VMX_EXIT_EPT_VIOLATION: 1814 ret = vcpu_exit_eptviolation(vrp); 1815 if (ret) 1816 return (ret); 1817 break; 1818 case VMX_EXIT_IO: 1819 case SVM_VMEXIT_IOIO: 1820 vcpu_exit_inout(vrp); 1821 break; 1822 case VMX_EXIT_HLT: 1823 case SVM_VMEXIT_HLT: 1824 ret = pthread_mutex_lock(&vcpu_run_mtx[vrp->vrp_vcpu_id]); 1825 if (ret) { 1826 log_warnx("%s: can't lock vcpu mutex (%d)", 1827 __func__, ret); 1828 return (ret); 1829 } 1830 vcpu_hlt[vrp->vrp_vcpu_id] = 1; 1831 ret = pthread_mutex_unlock(&vcpu_run_mtx[vrp->vrp_vcpu_id]); 1832 if (ret) { 1833 log_warnx("%s: can't unlock vcpu mutex (%d)", 1834 __func__, ret); 1835 return (ret); 1836 } 1837 break; 1838 case VMX_EXIT_TRIPLE_FAULT: 1839 case SVM_VMEXIT_SHUTDOWN: 1840 /* reset VM */ 1841 return (EAGAIN); 1842 default: 1843 log_debug("%s: unknown exit reason 0x%x", 1844 __progname, vrp->vrp_exit_reason); 1845 } 1846 1847 vrp->vrp_continue = 1; 1848 1849 return (0); 1850 } 1851 1852 /* 1853 * find_gpa_range 1854 * 1855 * Search for a contiguous guest physical mem range. 1856 * 1857 * Parameters: 1858 * vcp: VM create parameters that contain the memory map to search in 1859 * gpa: the starting guest physical address 1860 * len: the length of the memory range 1861 * 1862 * Return values: 1863 * NULL: on failure if there is no memory range as described by the parameters 1864 * Pointer to vm_mem_range that contains the start of the range otherwise. 1865 */ 1866 static struct vm_mem_range * 1867 find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len) 1868 { 1869 size_t i, n; 1870 struct vm_mem_range *vmr; 1871 1872 /* Find the first vm_mem_range that contains gpa */ 1873 for (i = 0; i < vcp->vcp_nmemranges; i++) { 1874 vmr = &vcp->vcp_memranges[i]; 1875 if (gpa < vmr->vmr_gpa + vmr->vmr_size) 1876 break; 1877 } 1878 1879 /* No range found. */ 1880 if (i == vcp->vcp_nmemranges) 1881 return (NULL); 1882 1883 /* 1884 * vmr may cover the range [gpa, gpa + len) only partly. Make 1885 * sure that the following vm_mem_ranges are contiguous and 1886 * cover the rest. 1887 */ 1888 n = vmr->vmr_size - (gpa - vmr->vmr_gpa); 1889 if (len < n) 1890 len = 0; 1891 else 1892 len -= n; 1893 gpa = vmr->vmr_gpa + vmr->vmr_size; 1894 for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) { 1895 vmr = &vcp->vcp_memranges[i]; 1896 if (gpa != vmr->vmr_gpa) 1897 return (NULL); 1898 if (len <= vmr->vmr_size) 1899 len = 0; 1900 else 1901 len -= vmr->vmr_size; 1902 1903 gpa = vmr->vmr_gpa + vmr->vmr_size; 1904 } 1905 1906 if (len != 0) 1907 return (NULL); 1908 1909 return (vmr); 1910 } 1911 1912 /* 1913 * write_mem 1914 * 1915 * Copies data from 'buf' into the guest VM's memory at paddr 'dst'. 1916 * 1917 * Parameters: 1918 * dst: the destination paddr_t in the guest VM 1919 * buf: data to copy (or NULL to zero the data) 1920 * len: number of bytes to copy 1921 * 1922 * Return values: 1923 * 0: success 1924 * EINVAL: if the guest physical memory range [dst, dst + len) does not 1925 * exist in the guest. 
1926 */ 1927 int 1928 write_mem(paddr_t dst, const void *buf, size_t len) 1929 { 1930 const char *from = buf; 1931 char *to; 1932 size_t n, off; 1933 struct vm_mem_range *vmr; 1934 1935 vmr = find_gpa_range(&current_vm->vm_params.vmc_params, dst, len); 1936 if (vmr == NULL) { 1937 errno = EINVAL; 1938 log_warn("%s: failed - invalid memory range dst = 0x%lx, " 1939 "len = 0x%zx", __func__, dst, len); 1940 return (EINVAL); 1941 } 1942 1943 off = dst - vmr->vmr_gpa; 1944 while (len != 0) { 1945 n = vmr->vmr_size - off; 1946 if (len < n) 1947 n = len; 1948 1949 to = (char *)vmr->vmr_va + off; 1950 if (buf == NULL) 1951 memset(to, 0, n); 1952 else { 1953 memcpy(to, from, n); 1954 from += n; 1955 } 1956 len -= n; 1957 off = 0; 1958 vmr++; 1959 } 1960 1961 return (0); 1962 } 1963 1964 /* 1965 * read_mem 1966 * 1967 * Reads memory at guest paddr 'src' into 'buf'. 1968 * 1969 * Parameters: 1970 * src: the source paddr_t in the guest VM to read from. 1971 * buf: destination (local) buffer 1972 * len: number of bytes to read 1973 * 1974 * Return values: 1975 * 0: success 1976 * EINVAL: if the guest physical memory range [src, src + len) does not 1977 * exist in the guest. 1978 */ 1979 int 1980 read_mem(paddr_t src, void *buf, size_t len) 1981 { 1982 char *from, *to = buf; 1983 size_t n, off; 1984 struct vm_mem_range *vmr; 1985 1986 vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len); 1987 if (vmr == NULL) { 1988 errno = EINVAL; 1989 log_warn("%s: failed - invalid memory range src = 0x%lx, " 1990 "len = 0x%zx", __func__, src, len); 1991 return (EINVAL); 1992 } 1993 1994 off = src - vmr->vmr_gpa; 1995 while (len != 0) { 1996 n = vmr->vmr_size - off; 1997 if (len < n) 1998 n = len; 1999 2000 from = (char *)vmr->vmr_va + off; 2001 memcpy(to, from, n); 2002 2003 to += n; 2004 len -= n; 2005 off = 0; 2006 vmr++; 2007 } 2008 2009 return (0); 2010 } 2011
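/*
 * Illustrative sketch, assuming a hypothetical descriptor structure and
 * guest physical address: device emulation code typically uses the two
 * helpers above to copy guest-visible data in and out of guest RAM, e.g.
 *
 *	struct some_desc d;				(hypothetical type)
 *
 *	if (read_mem(desc_gpa, &d, sizeof(d)))		(gpa -> local copy)
 *		return (EINVAL);
 *	d.flags |= SOME_FLAG;				(hypothetical update)
 *	if (write_mem(desc_gpa, &d, sizeof(d)))		(local copy -> gpa)
 *		return (EINVAL);
 *
 * Both helpers return 0 on success and EINVAL if the requested range is
 * not backed by guest memory.
 */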
2018 * 2019 * Parameters: 2020 * gpa: guest physical address to translate 2021 * len: number of bytes in the intended range 2022 * 2023 * Return values: 2024 * void* to host virtual memory on success 2025 * NULL on error, setting errno to: 2026 * EFAULT: gpa falls outside guest memory ranges 2027 * EINVAL: requested len extends beyond memory range 2028 */ 2029 void * 2030 hvaddr_mem(paddr_t gpa, size_t len) 2031 { 2032 struct vm_mem_range *vmr; 2033 size_t off; 2034 2035 vmr = find_gpa_range(&current_vm->vm_params.vmc_params, gpa, len); 2036 if (vmr == NULL) { 2037 log_warnx("%s: failed - invalid gpa: 0x%lx\n", __func__, gpa); 2038 errno = EFAULT; 2039 return (NULL); 2040 } 2041 2042 off = gpa - vmr->vmr_gpa; 2043 if (len > (vmr->vmr_size - off)) { 2044 log_warnx("%s: failed - invalid memory range: gpa=0x%lx, " 2045 "len=%zu", __func__, gpa, len); 2046 errno = EINVAL; 2047 return (NULL); 2048 } 2049 2050 return ((char *)vmr->vmr_va + off); 2051 } 2052 2053 /* 2054 * vcpu_assert_pic_irq 2055 * 2056 * Injects the specified IRQ on the supplied vcpu/vm 2057 * 2058 * Parameters: 2059 * vm_id: VM ID to inject to 2060 * vcpu_id: VCPU ID to inject to 2061 * irq: IRQ to inject 2062 */ 2063 void 2064 vcpu_assert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq) 2065 { 2066 int ret; 2067 2068 i8259_assert_irq(irq); 2069 2070 if (i8259_is_pending()) { 2071 if (vcpu_pic_intr(vm_id, vcpu_id, 1)) 2072 fatalx("%s: can't assert INTR", __func__); 2073 2074 ret = pthread_mutex_lock(&vcpu_run_mtx[vcpu_id]); 2075 if (ret) 2076 fatalx("%s: can't lock vcpu mtx (%d)", __func__, ret); 2077 2078 vcpu_hlt[vcpu_id] = 0; 2079 ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]); 2080 if (ret) 2081 fatalx("%s: can't signal (%d)", __func__, ret); 2082 ret = pthread_mutex_unlock(&vcpu_run_mtx[vcpu_id]); 2083 if (ret) 2084 fatalx("%s: can't unlock vcpu mtx (%d)", __func__, ret); 2085 } 2086 } 2087 2088 /* 2089 * vcpu_deassert_pic_irq 2090 * 2091 * Clears the specified IRQ on the supplied vcpu/vm 2092 * 2093 * Parameters: 2094 * vm_id: VM ID to clear in 2095 * vcpu_id: VCPU ID to clear in 2096 * irq: IRQ to clear 2097 */ 2098 void 2099 vcpu_deassert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq) 2100 { 2101 i8259_deassert_irq(irq); 2102 2103 if (!i8259_is_pending()) { 2104 if (vcpu_pic_intr(vm_id, vcpu_id, 0)) 2105 fatalx("%s: can't deassert INTR for vm_id %d, " 2106 "vcpu_id %d", __func__, vm_id, vcpu_id); 2107 } 2108 } 2109 2110 /* 2111 * fd_hasdata 2112 * 2113 * Determines if data can be read from a file descriptor. 2114 * 2115 * Parameters: 2116 * fd: the fd to check 2117 * 2118 * Return values: 2119 * 1 if data can be read from an fd, or 0 otherwise.
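 *
 * A typical (illustrative) use is a non-blocking readiness check before a
 * read on the console descriptor; 'buf' and 'n' here are hypothetical:
 *
 *	if (fd_hasdata(con_fd))
 *		n = read(con_fd, buf, sizeof(buf));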
2120 */ 2121 int 2122 fd_hasdata(int fd) 2123 { 2124 struct pollfd pfd[1]; 2125 int nready, hasdata = 0; 2126 2127 pfd[0].fd = fd; 2128 pfd[0].events = POLLIN; 2129 nready = poll(pfd, 1, 0); 2130 if (nready == -1) 2131 log_warn("checking file descriptor for data failed"); 2132 else if (nready == 1 && pfd[0].revents & POLLIN) 2133 hasdata = 1; 2134 return (hasdata); 2135 } 2136 2137 /* 2138 * mutex_lock 2139 * 2140 * Wrapper function for pthread_mutex_lock that does error checking and that 2141 * exits on failure 2142 */ 2143 void 2144 mutex_lock(pthread_mutex_t *m) 2145 { 2146 int ret; 2147 2148 ret = pthread_mutex_lock(m); 2149 if (ret) { 2150 errno = ret; 2151 fatal("could not acquire mutex"); 2152 } 2153 } 2154 2155 /* 2156 * mutex_unlock 2157 * 2158 * Wrapper function for pthread_mutex_unlock that does error checking and that 2159 * exits on failure 2160 */ 2161 void 2162 mutex_unlock(pthread_mutex_t *m) 2163 { 2164 int ret; 2165 2166 ret = pthread_mutex_unlock(m); 2167 if (ret) { 2168 errno = ret; 2169 fatal("could not release mutex"); 2170 } 2171 } 2172 2173 /* 2174 * set_return_data 2175 * 2176 * Utility function for manipulating register data in vm exit info structs. This 2177 * function ensures that the data is copied to the vei->vei.vei_data field with 2178 * the proper size for the operation being performed. 2179 * 2180 * Parameters: 2181 * vei: exit information 2182 * data: return data 2183 */ 2184 void 2185 set_return_data(struct vm_exit *vei, uint32_t data) 2186 { 2187 switch (vei->vei.vei_size) { 2188 case 1: 2189 vei->vei.vei_data &= ~0xFF; 2190 vei->vei.vei_data |= (uint8_t)data; 2191 break; 2192 case 2: 2193 vei->vei.vei_data &= ~0xFFFF; 2194 vei->vei.vei_data |= (uint16_t)data; 2195 break; 2196 case 4: 2197 vei->vei.vei_data = data; 2198 break; 2199 } 2200 } 2201 2202 /* 2203 * get_input_data 2204 * 2205 * Utility function for manipulating register data in vm exit info 2206 * structs. This function ensures that the data is copied from the 2207 * vei->vei.vei_data field with the proper size for the operation being 2208 * performed. 2209 * 2210 * Parameters: 2211 * vei: exit information 2212 * data: location to store the result 2213 */ 2214 void 2215 get_input_data(struct vm_exit *vei, uint32_t *data) 2216 { 2217 switch (vei->vei.vei_size) { 2218 case 1: 2219 *data &= 0xFFFFFF00; 2220 *data |= (uint8_t)vei->vei.vei_data; 2221 break; 2222 case 2: 2223 *data &= 0xFFFF0000; 2224 *data |= (uint16_t)vei->vei.vei_data; 2225 break; 2226 case 4: 2227 *data = vei->vei.vei_data; 2228 break; 2229 default: 2230 log_warnx("%s: invalid i/o size %d", __func__, 2231 vei->vei.vei_size); 2232 } 2233 2234 } 2235 2236 /* 2237 * translate_gva 2238 * 2239 * Translates a guest virtual address to a guest physical address by walking 2240 * the currently active page table (if needed). 2241 * 2242 * XXX ensure translate_gva updates the A bit in the PTE 2243 * XXX ensure translate_gva respects segment base and limits in i386 mode 2244 * XXX ensure translate_gva respects segment wraparound in i8086 mode 2245 * XXX ensure translate_gva updates the A bit in the segment selector 2246 * XXX ensure translate_gva respects CR4.LMSLE if available 2247 * 2248 * Parameters: 2249 * exit: The VCPU this translation should be performed for (guest MMU settings 2250 * are gathered from this VCPU) 2251 * va: virtual address to translate 2252 * pa: pointer to paddr_t variable that will receive the translated physical 2253 * address. 'pa' is unchanged on error. 
2254 * mode: one of PROT_READ, PROT_WRITE, PROT_EXEC indicating the mode in which 2255 * the address should be translated 2256 * 2257 * Return values: 2258 * 0: the address was successfully translated - 'pa' contains the physical 2259 * address currently mapped by 'va'. 2260 * EFAULT: the PTE for 'VA' is unmapped. A #PF will be injected in this case 2261 * and %cr2 set in the vcpu structure. 2262 * EINVAL: an error occurred reading paging table structures 2263 */ 2264 int 2265 translate_gva(struct vm_exit* exit, uint64_t va, uint64_t* pa, int mode) 2266 { 2267 int level, shift, pdidx; 2268 uint64_t pte, pt_paddr, pte_paddr, mask, low_mask, high_mask; 2269 uint64_t shift_width, pte_size; 2270 struct vcpu_reg_state *vrs; 2271 2272 vrs = &exit->vrs; 2273 2274 if (!pa) 2275 return (EINVAL); 2276 2277 if (!(vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PG)) { 2278 log_debug("%s: unpaged, va=pa=0x%llx", __func__, va); 2279 *pa = va; 2280 return (0); 2281 } 2282 2283 pt_paddr = vrs->vrs_crs[VCPU_REGS_CR3]; 2284 2285 log_debug("%s: guest %%cr0=0x%llx, %%cr3=0x%llx", __func__, 2286 vrs->vrs_crs[VCPU_REGS_CR0], vrs->vrs_crs[VCPU_REGS_CR3]); 2287 2288 if (vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PE) { 2289 if (vrs->vrs_crs[VCPU_REGS_CR4] & CR4_PAE) { 2290 pte_size = sizeof(uint64_t); 2291 shift_width = 9; 2292 2293 if (vrs->vrs_msrs[VCPU_REGS_EFER] & EFER_LMA) { 2294 /* 4 level paging */ 2295 level = 4; 2296 mask = L4_MASK; 2297 shift = L4_SHIFT; 2298 } else { 2299 /* 32 bit with PAE paging */ 2300 level = 3; 2301 mask = L3_MASK; 2302 shift = L3_SHIFT; 2303 } 2304 } else { 2305 /* 32 bit paging */ 2306 level = 2; 2307 shift_width = 10; 2308 mask = 0xFFC00000; 2309 shift = 22; 2310 pte_size = sizeof(uint32_t); 2311 } 2312 } else 2313 return (EINVAL); 2314 2315 /* XXX: Check for R bit in segment selector and set A bit */ 2316 2317 for (;level > 0; level--) { 2318 pdidx = (va & mask) >> shift; 2319 pte_paddr = (pt_paddr) + (pdidx * pte_size); 2320 2321 log_debug("%s: read pte level %d @ GPA 0x%llx", __func__, 2322 level, pte_paddr); 2323 if (read_mem(pte_paddr, &pte, pte_size)) { 2324 log_warn("%s: failed to read pte", __func__); 2325 return (EFAULT); 2326 } 2327 2328 log_debug("%s: PTE @ 0x%llx = 0x%llx", __func__, pte_paddr, 2329 pte); 2330 2331 /* XXX: Set CR2 */ 2332 if (!(pte & PG_V)) 2333 return (EFAULT); 2334 2335 /* XXX: Check for SMAP */ 2336 if ((mode == PROT_WRITE) && !(pte & PG_RW)) 2337 return (EPERM); 2338 2339 if ((exit->cpl > 0) && !(pte & PG_u)) 2340 return (EPERM); 2341 2342 pte = pte | PG_U; 2343 if (mode == PROT_WRITE) 2344 pte = pte | PG_M; 2345 if (write_mem(pte_paddr, &pte, pte_size)) { 2346 log_warn("%s: failed to write back flags to pte", 2347 __func__); 2348 return (EIO); 2349 } 2350 2351 /* XXX: EINVAL if in 32bit and PG_PS is 1 but CR4.PSE is 0 */ 2352 if (pte & PG_PS) 2353 break; 2354 2355 if (level > 1) { 2356 pt_paddr = pte & PG_FRAME; 2357 shift -= shift_width; 2358 mask = mask >> shift_width; 2359 } 2360 } 2361 2362 low_mask = (1 << shift) - 1; 2363 high_mask = (((uint64_t)1ULL << ((pte_size * 8) - 1)) - 1) ^ low_mask; 2364 *pa = (pte & high_mask) | (va & low_mask); 2365 2366 log_debug("%s: final GPA for GVA 0x%llx = 0x%llx\n", __func__, va, *pa); 2367 2368 return (0); 2369 } 2370 2371 /* 2372 * vm_pipe_init 2373 * 2374 * Initialize a vm_dev_pipe, setting up its file descriptors and its 2375 * event structure with the given callback. 
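 *
 * A sketch of the intended setup (the device names are illustrative; the
 * caller remains responsible for event_add()ing the read-end event):
 *
 *	static struct vm_dev_pipe dev_pipe;
 *
 *	vm_pipe_init(&dev_pipe, dev_dispatch);
 *	event_add(&dev_pipe.read_ev, NULL);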
2376 * 2377 * Parameters: 2378 * p: pointer to vm_dev_pipe struct to initialize 2379 * cb: callback to use for READ events on the read end of the pipe 2380 */ 2381 void 2382 vm_pipe_init(struct vm_dev_pipe *p, void (*cb)(int, short, void *)) 2383 { 2384 int ret; 2385 int fds[2]; 2386 2387 memset(p, 0, sizeof(struct vm_dev_pipe)); 2388 2389 ret = pipe(fds); 2390 if (ret) 2391 fatal("failed to create vm_dev_pipe pipe"); 2392 2393 p->read = fds[0]; 2394 p->write = fds[1]; 2395 2396 event_set(&p->read_ev, p->read, EV_READ | EV_PERSIST, cb, NULL); 2397 } 2398 2399 /* 2400 * vm_pipe_send 2401 * 2402 * Send a message to an emulated device via the provided vm_dev_pipe. 2403 * 2404 * Parameters: 2405 * p: pointer to initialized vm_dev_pipe 2406 * msg: message to send in the channel 2407 */ 2408 void 2409 vm_pipe_send(struct vm_dev_pipe *p, enum pipe_msg_type msg) 2410 { 2411 size_t n; 2412 n = write(p->write, &msg, sizeof(msg)); 2413 if (n != sizeof(msg)) 2414 fatal("failed to write to device pipe"); 2415 } 2416 2417 /* 2418 * vm_pipe_recv 2419 * 2420 * Receive a message for an emulated device via the provided vm_dev_pipe. 2421 * Returns the message value, otherwise will exit on failure. 2422 * 2423 * Parameters: 2424 * p: pointer to initialized vm_dev_pipe 2425 * 2426 * Return values: 2427 * a value of enum pipe_msg_type or fatal exit on read(2) error 2428 */ 2429 enum pipe_msg_type 2430 vm_pipe_recv(struct vm_dev_pipe *p) 2431 { 2432 size_t n; 2433 enum pipe_msg_type msg; 2434 n = read(p->read, &msg, sizeof(msg)); 2435 if (n != sizeof(msg)) 2436 fatal("failed to read from device pipe"); 2437 2438 return msg; 2439 } 2440
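
/*
 * Illustrative pairing of vm_pipe_send() and vm_pipe_recv() above (a sketch,
 * not part of the emulation itself; dev_pipe, dev_dispatch and the message
 * value are hypothetical stand-ins for a real device's enum pipe_msg_type
 * values):
 *
 * A device or vcpu thread wakes the event thread:
 *
 *	vm_pipe_send(&dev_pipe, MSG_DEV_WAKE);
 *
 * The libevent callback registered via vm_pipe_init() then drains it:
 *
 *	static void
 *	dev_dispatch(int fd, short event, void *arg)
 *	{
 *		enum pipe_msg_type msg = vm_pipe_recv(&dev_pipe);
 *		...
 *	}
 */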