/*	$OpenBSD: vm.c,v 1.65 2021/09/01 11:08:21 dv Exp $	*/

/*
 * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#include <sys/wait.h>
#include <sys/uio.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/mman.h>

#include <dev/ic/i8253reg.h>
#include <dev/isa/isareg.h>
#include <dev/pci/pcireg.h>

#include <machine/param.h>
#include <machine/psl.h>
#include <machine/pte.h>
#include <machine/specialreg.h>
#include <machine/vmmvar.h>

#include <net/if.h>

#include <errno.h>
#include <event.h>
#include <fcntl.h>
#include <imsg.h>
#include <limits.h>
#include <poll.h>
#include <pthread.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <util.h>

#include "atomicio.h"
#include "fw_cfg.h"
#include "i8253.h"
#include "i8259.h"
#include "loadfile.h"
#include "mc146818.h"
#include "ns8250.h"
#include "pci.h"
#include "virtio.h"
#include "vmd.h"
#include "vmm.h"

io_fn_t ioports_map[MAX_PORTS];

int run_vm(int, int[][VM_MAX_BASE_PER_DISK], int *,
    struct vmop_create_params *, struct vcpu_reg_state *);
void vm_dispatch_vmm(int, short, void *);
void *event_thread(void *);
void *vcpu_run_loop(void *);
int vcpu_exit(struct vm_run_params *);
int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *);
void create_memory_map(struct vm_create_params *);
int alloc_guest_mem(struct vm_create_params *);
int vmm_create_vm(struct vm_create_params *);
void init_emulated_hw(struct vmop_create_params *, int,
    int[][VM_MAX_BASE_PER_DISK], int *);
void restore_emulated_hw(struct vm_create_params *, int, int *,
    int[][VM_MAX_BASE_PER_DISK], int);
void vcpu_exit_inout(struct vm_run_params *);
int vcpu_exit_eptviolation(struct vm_run_params *);
uint8_t vcpu_exit_pci(struct vm_run_params *);
int vcpu_pic_intr(uint32_t, uint32_t, uint8_t);
int loadfile_bios(gzFile, off_t, struct vcpu_reg_state *);
int send_vm(int, struct vm_create_params *);
int dump_send_header(int);
int dump_vmr(int, struct vm_mem_range *);
int dump_mem(int, struct vm_create_params *);
void restore_vmr(int, struct vm_mem_range *);
void restore_mem(int, struct vm_create_params *);
int restore_vm_params(int, struct vm_create_params *);
void pause_vm(struct vm_create_params *);
void unpause_vm(struct vm_create_params *);

int translate_gva(struct vm_exit *, uint64_t, uint64_t *, int);

static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t,
    size_t);

int con_fd;
struct vmd_vm *current_vm;

extern struct vmd *env;

extern char *__progname;

pthread_mutex_t threadmutex;
pthread_cond_t threadcond;

pthread_cond_t vcpu_run_cond[VMM_MAX_VCPUS_PER_VM];
pthread_mutex_t vcpu_run_mtx[VMM_MAX_VCPUS_PER_VM];
pthread_barrier_t vm_pause_barrier;
pthread_cond_t vcpu_unpause_cond[VMM_MAX_VCPUS_PER_VM];
pthread_mutex_t vcpu_unpause_mtx[VMM_MAX_VCPUS_PER_VM];
uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM];
uint8_t vcpu_done[VMM_MAX_VCPUS_PER_VM];

/*
 * Represents a standard register set for an OS to be booted
 * as a flat 64 bit address space.
 *
 * NOT set here are:
 * RIP
 * RSP
 * GDTR BASE
 *
 * Specific bootloaders should clone this structure and override
 * those fields as needed.
 *
 * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on
 * features of the CPU in use.
 */
static const struct vcpu_reg_state vcpu_init_flat64 = {
	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
	.vrs_gprs[VCPU_REGS_RIP] = 0x0,
	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
	.vrs_crs[VCPU_REGS_CR0] = CR0_ET | CR0_PE | CR0_PG,
	.vrs_crs[VCPU_REGS_CR3] = PML4_PAGE,
	.vrs_crs[VCPU_REGS_CR4] = CR4_PAE | CR4_PSE,
	.vrs_crs[VCPU_REGS_PDPTE0] = 0ULL,
	.vrs_crs[VCPU_REGS_PDPTE1] = 0ULL,
	.vrs_crs[VCPU_REGS_PDPTE2] = 0ULL,
	.vrs_crs[VCPU_REGS_PDPTE3] = 0ULL,
	.vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0},
	.vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
	.vrs_msrs[VCPU_REGS_EFER] = EFER_LME | EFER_LMA,
	.vrs_drs[VCPU_REGS_DR0] = 0x0,
	.vrs_drs[VCPU_REGS_DR1] = 0x0,
	.vrs_drs[VCPU_REGS_DR2] = 0x0,
	.vrs_drs[VCPU_REGS_DR3] = 0x0,
	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
	.vrs_drs[VCPU_REGS_DR7] = 0x400,
	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
	.vrs_msrs[VCPU_REGS_MISC_ENABLE] = 0ULL,
	.vrs_crs[VCPU_REGS_XCR0] = XCR0_X87
};

/*
 * Represents a standard register set for a BIOS to be booted
 * as a flat 16 bit address space.
 */
static const struct vcpu_reg_state vcpu_init_flat16 = {
	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
	.vrs_gprs[VCPU_REGS_RIP] = 0xFFF0,
	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
	.vrs_crs[VCPU_REGS_CR0] = 0x60000010,
	.vrs_crs[VCPU_REGS_CR3] = 0,
	.vrs_sregs[VCPU_REGS_CS] = { 0xF000, 0xFFFF, 0x809F, 0xF0000},
	.vrs_sregs[VCPU_REGS_DS] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_sregs[VCPU_REGS_ES] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_sregs[VCPU_REGS_FS] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_sregs[VCPU_REGS_GS] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_sregs[VCPU_REGS_SS] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
	.vrs_msrs[VCPU_REGS_EFER] = 0ULL,
	.vrs_drs[VCPU_REGS_DR0] = 0x0,
	.vrs_drs[VCPU_REGS_DR1] = 0x0,
	.vrs_drs[VCPU_REGS_DR2] = 0x0,
	.vrs_drs[VCPU_REGS_DR3] = 0x0,
	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
	.vrs_drs[VCPU_REGS_DR7] = 0x400,
	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
	.vrs_crs[VCPU_REGS_XCR0] = XCR0_X87
};

/*
 * loadfile_bios
 *
 * As an alternative to loadfile_elf, this function loads a non-ELF BIOS image
 * directly into memory.
 *
 * Parameters:
 *  fp: file of a kernel file to load
 *  size: uncompressed size of the image
 *  (out) vrs: register state to set on init for this kernel
 *
 * Return values:
 *  0 if successful
 *  various error codes returned from read(2) or loadelf functions
 */
int
loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_state *vrs)
{
	off_t off;

	/* Set up a "flat 16 bit" register state for BIOS */
	memcpy(vrs, &vcpu_init_flat16, sizeof(*vrs));

	/* Seek to the beginning of the BIOS image */
	if (gzseek(fp, 0, SEEK_SET) == -1)
		return (-1);

	/* The BIOS image must end at 1MB */
	if ((off = 1048576 - size) < 0)
		return (-1);

	/* Read BIOS image into memory */
	if (mread(fp, off, size) != (size_t)size) {
		errno = EIO;
		return (-1);
	}

	log_debug("%s: loaded BIOS image", __func__);

	return (0);
}

/*
 * start_vm
 *
 * After forking a new VM process, starts the new VM with the creation
 * parameters supplied (in the incoming vm->vm_params field). This
 * function performs a basic sanity check on the incoming parameters
 * and then performs the following steps to complete the creation of the VM:
 *
 * 1. validates and creates the new VM
 * 2. opens the imsg control channel to the parent and drops more privilege
 * 3. drops additional privileges by calling pledge(2)
 * 4. loads the kernel from the disk image or file descriptor
 * 5. runs the VM's VCPU loops.
 *
 * Parameters:
 *  vm: The VM data structure that includes the VM create parameters.
 *  fd: The imsg socket that is connected to the parent process.
 *
 * Return values:
 *  0: success
 *  !0: failure - typically an errno indicating the source of the failure
 */
int
start_vm(struct vmd_vm *vm, int fd)
{
	struct vmop_create_params *vmc = &vm->vm_params;
	struct vm_create_params	*vcp = &vmc->vmc_params;
	struct vcpu_reg_state vrs;
	int nicfds[VMM_MAX_NICS_PER_VM];
	int ret;
	gzFile fp;
	size_t i;
	struct vm_rwregs_params vrp;
	struct stat sb;

	/* Child */
	setproctitle("%s", vcp->vcp_name);
	log_procinit(vcp->vcp_name);

	if (!(vm->vm_state & VM_STATE_RECEIVED))
		create_memory_map(vcp);

	ret = alloc_guest_mem(vcp);

	if (ret) {
		errno = ret;
		fatal("could not allocate guest memory - exiting");
	}

	ret = vmm_create_vm(vcp);
	current_vm = vm;

	/* send back the kernel-generated vm id (0 on error) */
	if (write(fd, &vcp->vcp_id, sizeof(vcp->vcp_id)) !=
	    sizeof(vcp->vcp_id))
		fatal("write vcp id");

	if (ret) {
		errno = ret;
		fatal("create vmm ioctl failed - exiting");
	}

	/*
	 * pledge in the vm processes:
	 * stdio - for malloc and basic I/O including events.
	 * recvfd - for send/recv.
	 * vmm - for the vmm ioctls and operations.
	 */
	if (pledge("stdio vmm recvfd", NULL) == -1)
		fatal("pledge");

	if (vm->vm_state & VM_STATE_RECEIVED) {
		ret = read(vm->vm_receive_fd, &vrp, sizeof(vrp));
		if (ret != sizeof(vrp)) {
			fatal("received incomplete vrp - exiting");
		}
		vrs = vrp.vrwp_regs;
	} else {
		/*
		 * Set up default "flat 64 bit" register state - RIP,
		 * RSP, and GDT info will be set in bootloader
		 */
		memcpy(&vrs, &vcpu_init_flat64, sizeof(vrs));

		/* Find and open kernel image */
		if ((fp = gzdopen(vm->vm_kernel, "r")) == NULL)
			fatalx("failed to open kernel - exiting");

		/* Load kernel image */
		ret = loadfile_elf(fp, vcp, &vrs);

		/*
		 * Try BIOS as a fallback (only if it was provided as an image
		 * with vm->vm_kernel and the file is not compressed)
		 */
		if (ret && errno == ENOEXEC && vm->vm_kernel != -1 &&
		    gzdirect(fp) && (ret = fstat(vm->vm_kernel, &sb)) == 0)
			ret = loadfile_bios(fp, sb.st_size, &vrs);

		if (ret)
			fatal("failed to load kernel or BIOS - exiting");

		gzclose(fp);
	}

	if (vm->vm_kernel != -1)
		close(vm->vm_kernel);

	con_fd = vm->vm_tty;
	if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1)
		fatal("failed to set nonblocking mode on console");

	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
		nicfds[i] = vm->vm_ifs[i].vif_fd;

	event_init();

	if (vm->vm_state & VM_STATE_RECEIVED) {
		restore_emulated_hw(vcp, vm->vm_receive_fd, nicfds,
		    vm->vm_disks, vm->vm_cdrom);
		restore_mem(vm->vm_receive_fd, vcp);
		if (restore_vm_params(vm->vm_receive_fd, vcp))
			fatal("restore vm params failed");
		unpause_vm(vcp);
	}

	if (vmm_pipe(vm, fd, vm_dispatch_vmm) == -1)
		fatal("setup vm pipe");

	/* Execute the vcpu run loop(s) for this VM */
	ret = run_vm(vm->vm_cdrom, vm->vm_disks, nicfds, &vm->vm_params, &vrs);

	/* Ensure that any in-flight data is written back */
	virtio_shutdown(vm);

	return (ret);
}

/*
 * vm_dispatch_vmm
 *
 * imsg callback for messages that are received from the vmm parent process.
 */
void
vm_dispatch_vmm(int fd, short event, void *arg)
{
	struct vmd_vm *vm = arg;
	struct vmop_result vmr;
	struct vmop_addr_result var;
	struct imsgev *iev = &vm->vm_iev;
	struct imsgbuf *ibuf = &iev->ibuf;
	struct imsg imsg;
	ssize_t n;
	int verbose;

	if (event & EV_READ) {
		if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
			fatal("%s: imsg_read", __func__);
		if (n == 0)
			_exit(0);
	}

	if (event & EV_WRITE) {
		if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
			fatal("%s: msgbuf_write fd %d", __func__, ibuf->fd);
		if (n == 0)
			_exit(0);
	}

	for (;;) {
		if ((n = imsg_get(ibuf, &imsg)) == -1)
			fatal("%s: imsg_get", __func__);
		if (n == 0)
			break;

#if DEBUG > 1
		log_debug("%s: got imsg %d from %s",
		    __func__, imsg.hdr.type,
		    vm->vm_params.vmc_params.vcp_name);
#endif

		switch (imsg.hdr.type) {
		case IMSG_CTL_VERBOSE:
			IMSG_SIZE_CHECK(&imsg, &verbose);
			memcpy(&verbose, imsg.data, sizeof(verbose));
			log_setverbose(verbose);
			break;
		case IMSG_VMDOP_VM_SHUTDOWN:
			if (vmmci_ctl(VMMCI_SHUTDOWN) == -1)
				_exit(0);
			break;
		case IMSG_VMDOP_VM_REBOOT:
			if (vmmci_ctl(VMMCI_REBOOT) == -1)
				_exit(0);
			break;
		case IMSG_VMDOP_PAUSE_VM:
			vmr.vmr_result = 0;
			vmr.vmr_id = vm->vm_vmid;
			pause_vm(&vm->vm_params.vmc_params);
			imsg_compose_event(&vm->vm_iev,
			    IMSG_VMDOP_PAUSE_VM_RESPONSE,
			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
			    sizeof(vmr));
			break;
		case IMSG_VMDOP_UNPAUSE_VM:
			vmr.vmr_result = 0;
			vmr.vmr_id = vm->vm_vmid;
			unpause_vm(&vm->vm_params.vmc_params);
			imsg_compose_event(&vm->vm_iev,
			    IMSG_VMDOP_UNPAUSE_VM_RESPONSE,
			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
			    sizeof(vmr));
			break;
		case IMSG_VMDOP_SEND_VM_REQUEST:
			vmr.vmr_id = vm->vm_vmid;
			vmr.vmr_result = send_vm(imsg.fd,
			    &vm->vm_params.vmc_params);
			imsg_compose_event(&vm->vm_iev,
			    IMSG_VMDOP_SEND_VM_RESPONSE,
			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
			    sizeof(vmr));
			if (!vmr.vmr_result) {
				imsg_flush(&current_vm->vm_iev.ibuf);
				_exit(0);
			}
			break;
		case IMSG_VMDOP_PRIV_GET_ADDR_RESPONSE:
			IMSG_SIZE_CHECK(&imsg, &var);
			memcpy(&var, imsg.data, sizeof(var));

			log_debug("%s: received tap addr %s for nic %d",
			    vm->vm_params.vmc_params.vcp_name,
			    ether_ntoa((void *)var.var_addr), var.var_nic_idx);

			vionet_set_hostmac(vm, var.var_nic_idx, var.var_addr);
			break;
		default:
			fatalx("%s: got invalid imsg %d from %s",
			    __func__, imsg.hdr.type,
			    vm->vm_params.vmc_params.vcp_name);
		}
		imsg_free(&imsg);
	}
	imsg_event_add(iev);
}

/*
 * vm_shutdown
 *
 * Tell the vmm parent process to shutdown or reboot the VM and exit.
 */
__dead void
vm_shutdown(unsigned int cmd)
{
	switch (cmd) {
	case VMMCI_NONE:
	case VMMCI_SHUTDOWN:
		(void)imsg_compose_event(&current_vm->vm_iev,
		    IMSG_VMDOP_VM_SHUTDOWN, 0, 0, -1, NULL, 0);
		break;
	case VMMCI_REBOOT:
		(void)imsg_compose_event(&current_vm->vm_iev,
		    IMSG_VMDOP_VM_REBOOT, 0, 0, -1, NULL, 0);
		break;
	default:
		fatalx("invalid vm ctl command: %d", cmd);
	}
	imsg_flush(&current_vm->vm_iev.ibuf);

	_exit(0);
}

/*
 * send_vm
 *
 * Dumps the running VM to the given fd for migration: pauses the VM,
 * writes the dump header, create parameters, vcpu registers, emulated
 * device state and guest memory, then terminates the VM. On error the
 * VM is unpaused and left running.
 */
int
send_vm(int fd, struct vm_create_params *vcp)
{
	struct vm_rwregs_params vrp;
	struct vm_rwvmparams_params vpp;
	struct vmop_create_params *vmc;
	struct vm_terminate_params vtp;
	unsigned int flags = 0;
	unsigned int i;
	int ret = 0;
	size_t sz;

	if (dump_send_header(fd)) {
		log_info("%s: failed to send vm dump header", __func__);
		goto err;
	}

	pause_vm(vcp);

	vmc = calloc(1, sizeof(struct vmop_create_params));
	if (vmc == NULL) {
		log_warn("%s: calloc error getting vmc", __func__);
		ret = -1;
		goto err;
	}

	flags |= VMOP_CREATE_MEMORY;
	memcpy(&vmc->vmc_params, &current_vm->vm_params, sizeof(struct
	    vmop_create_params));
	vmc->vmc_flags = flags;
	vrp.vrwp_vm_id = vcp->vcp_id;
	vrp.vrwp_mask = VM_RWREGS_ALL;
	vpp.vpp_mask = VM_RWVMPARAMS_ALL;
	vpp.vpp_vm_id = vcp->vcp_id;

	sz = atomicio(vwrite, fd, vmc, sizeof(struct vmop_create_params));
	if (sz != sizeof(struct vmop_create_params)) {
		ret = -1;
		goto err;
	}

	for (i = 0; i < vcp->vcp_ncpus; i++) {
		vrp.vrwp_vcpu_id = i;
		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READREGS, &vrp))) {
			log_warn("%s: readregs failed", __func__);
			goto err;
		}

		sz = atomicio(vwrite, fd, &vrp,
		    sizeof(struct vm_rwregs_params));
		if (sz != sizeof(struct vm_rwregs_params)) {
			log_warn("%s: dumping registers failed", __func__);
			ret = -1;
			goto err;
		}
	}

	if ((ret = i8253_dump(fd)))
		goto err;
	if ((ret = i8259_dump(fd)))
		goto err;
	if ((ret = ns8250_dump(fd)))
		goto err;
	if ((ret = mc146818_dump(fd)))
		goto err;
	if ((ret = fw_cfg_dump(fd)))
		goto err;
	if ((ret = pci_dump(fd)))
		goto err;
	if ((ret = virtio_dump(fd)))
		goto err;
	if ((ret = dump_mem(fd, vcp)))
		goto err;

	for (i = 0; i < vcp->vcp_ncpus; i++) {
		vpp.vpp_vcpu_id = i;
		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READVMPARAMS, &vpp))) {
			log_warn("%s: readvmparams failed", __func__);
			goto err;
		}

		sz = atomicio(vwrite, fd, &vpp,
		    sizeof(struct vm_rwvmparams_params));
		if (sz != sizeof(struct vm_rwvmparams_params)) {
			log_warn("%s: dumping vm params failed", __func__);
			ret = -1;
			goto err;
		}
	}

	vtp.vtp_vm_id = vcp->vcp_id;
	if (ioctl(env->vmd_fd, VMM_IOC_TERM, &vtp) == -1) {
		log_warnx("%s: term IOC error: %d, %d", __func__,
		    errno, ENOENT);
	}
err:
	close(fd);
	if (ret)
		unpause_vm(vcp);
	return ret;
}

/*
 * dump_send_header
 *
 * Writes the vm dump header (signature, CPUID leaves, version) to fd.
 */
int
dump_send_header(int fd) {
	struct vm_dump_header vmh;
	int i;

	memcpy(&vmh.vmh_signature, VM_DUMP_SIGNATURE,
	    sizeof(vmh.vmh_signature));

	vmh.vmh_cpuids[0].code = 0x00;
	vmh.vmh_cpuids[0].leaf = 0x00;

	vmh.vmh_cpuids[1].code = 0x01;
	vmh.vmh_cpuids[1].leaf = 0x00;

	vmh.vmh_cpuids[2].code = 0x07;
	vmh.vmh_cpuids[2].leaf = 0x00;

	vmh.vmh_cpuids[3].code = 0x0d;
	vmh.vmh_cpuids[3].leaf = 0x00;

	vmh.vmh_cpuids[4].code = 0x80000001;
	vmh.vmh_cpuids[4].leaf = 0x00;

	vmh.vmh_version = VM_DUMP_VERSION;

	for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
		CPUID_LEAF(vmh.vmh_cpuids[i].code,
		    vmh.vmh_cpuids[i].leaf,
		    vmh.vmh_cpuids[i].a,
		    vmh.vmh_cpuids[i].b,
		    vmh.vmh_cpuids[i].c,
		    vmh.vmh_cpuids[i].d);
	}

	if (atomicio(vwrite, fd, &vmh, sizeof(vmh)) != sizeof(vmh))
		return (-1);

	return (0);
}

int
dump_mem(int fd, struct vm_create_params *vcp)
{
	unsigned int i;
	int ret;
	struct vm_mem_range *vmr;

	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		ret = dump_vmr(fd, vmr);
		if (ret)
			return ret;
	}
	return (0);
}

int
restore_vm_params(int fd, struct vm_create_params *vcp) {
	unsigned int i;
	struct vm_rwvmparams_params vpp;

	for (i = 0; i < vcp->vcp_ncpus; i++) {
		if (atomicio(read, fd, &vpp, sizeof(vpp)) != sizeof(vpp)) {
			log_warn("%s: error restoring vm params", __func__);
			return (-1);
		}
		vpp.vpp_vm_id = vcp->vcp_id;
		vpp.vpp_vcpu_id = i;
		if (ioctl(env->vmd_fd, VMM_IOC_WRITEVMPARAMS, &vpp) < 0) {
			log_debug("%s: writing vm params failed", __func__);
			return (-1);
		}
	}
	return (0);
}

void
restore_mem(int fd, struct vm_create_params *vcp)
{
	unsigned int i;
	struct vm_mem_range *vmr;

	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		restore_vmr(fd, vmr);
	}
}

int
dump_vmr(int fd, struct vm_mem_range *vmr)
{
	size_t rem = vmr->vmr_size, read = 0;
	char buf[PAGE_SIZE];

	while (rem > 0) {
		if (read_mem(vmr->vmr_gpa + read, buf, PAGE_SIZE)) {
			log_warn("failed to read vmr");
			return (-1);
		}
		if (atomicio(vwrite, fd, buf, sizeof(buf)) != sizeof(buf)) {
			log_warn("failed to dump vmr");
			return (-1);
		}
		rem = rem - PAGE_SIZE;
		read = read + PAGE_SIZE;
	}
	return (0);
}

void
restore_vmr(int fd, struct vm_mem_range *vmr)
{
	size_t rem = vmr->vmr_size, wrote = 0;
	char buf[PAGE_SIZE];

	while (rem > 0) {
		if (atomicio(read, fd, buf, sizeof(buf)) != sizeof(buf))
			fatal("failed to restore vmr");
		if (write_mem(vmr->vmr_gpa + wrote, buf, PAGE_SIZE))
			fatal("failed to write vmr");
		rem = rem - PAGE_SIZE;
		wrote = wrote + PAGE_SIZE;
	}
}

void
pause_vm(struct vm_create_params *vcp)
{
	unsigned int n;
	int ret;

	if (current_vm->vm_state & VM_STATE_PAUSED)
		return;

	current_vm->vm_state |= VM_STATE_PAUSED;

	ret = pthread_barrier_init(&vm_pause_barrier, NULL, vcp->vcp_ncpus + 1);
	if (ret) {
		log_warnx("%s: cannot initialize pause barrier (%d)",
		    __progname, ret);
		return;
	}

	for (n = 0; n < vcp->vcp_ncpus; n++) {
		ret = pthread_cond_broadcast(&vcpu_run_cond[n]);
		if (ret) {
			log_warnx("%s: can't broadcast vcpu run cond (%d)",
			    __func__, (int)ret);
			return;
		}
	}
	ret = pthread_barrier_wait(&vm_pause_barrier);
	if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) {
		log_warnx("%s: could not wait on pause barrier (%d)",
		    __func__, (int)ret);
		return;
	}

	ret = pthread_barrier_destroy(&vm_pause_barrier);
	if (ret) {
		log_warnx("%s: could not destroy pause barrier (%d)",
		    __progname, ret);
		return;
	}

	i8253_stop();
	mc146818_stop();
	ns8250_stop();
	virtio_stop(vcp);
}

void
unpause_vm(struct vm_create_params *vcp)
{
	unsigned int n;
	int ret;

	if (!(current_vm->vm_state & VM_STATE_PAUSED))
		return;

	current_vm->vm_state &= ~VM_STATE_PAUSED;
	for (n = 0; n < vcp->vcp_ncpus; n++) {
		ret = pthread_cond_broadcast(&vcpu_unpause_cond[n]);
		if (ret) {
			log_warnx("%s: can't broadcast vcpu unpause cond (%d)",
			    __func__, (int)ret);
			return;
		}
	}

	i8253_start();
	mc146818_start();
	ns8250_start();
	virtio_start(vcp);
}

/*
 * vcpu_reset
 *
 * Requests vmm(4) to reset the VCPUs in the indicated VM to
 * the register state provided
 *
 * Parameters
 *  vmid: VM ID to reset
 *  vcpu_id: VCPU ID to reset
 *  vrs: the register state to initialize
 *
 * Return values:
 *  0: success
 *  !0: ioctl to vmm(4) failed (eg, ENOENT if the supplied VM ID is not
 *      valid)
 */
int
vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs)
{
	struct vm_resetcpu_params vrp;

	memset(&vrp, 0, sizeof(vrp));
	vrp.vrp_vm_id = vmid;
	vrp.vrp_vcpu_id = vcpu_id;
	memcpy(&vrp.vrp_init_state, vrs, sizeof(struct vcpu_reg_state));

	log_debug("%s: resetting vcpu %d for vm %d", __func__, vcpu_id, vmid);

	if (ioctl(env->vmd_fd, VMM_IOC_RESETCPU, &vrp) == -1)
		return (errno);

	return (0);
}

/*
 * create_memory_map
 *
 * Sets up the guest physical memory ranges that the VM can access.
 *
 * Parameters:
 *  vcp: VM create parameters describing the VM whose memory map
 *       is being created
 *
 * Return values:
 *  nothing
 */
void
create_memory_map(struct vm_create_params *vcp)
{
	size_t len, mem_bytes, mem_mb;

	mem_mb = vcp->vcp_memranges[0].vmr_size;
	vcp->vcp_nmemranges = 0;
	if (mem_mb < 1 || mem_mb > VMM_MAX_VM_MEM_SIZE)
		return;

	mem_bytes = mem_mb * 1024 * 1024;

	/* First memory region: 0 - LOWMEM_KB (DOS low mem) */
	len = LOWMEM_KB * 1024;
	vcp->vcp_memranges[0].vmr_gpa = 0x0;
	vcp->vcp_memranges[0].vmr_size = len;
	mem_bytes -= len;

	/*
	 * Second memory region: LOWMEM_KB - 1MB.
	 *
	 * N.B. - Normally ROMs or parts of video RAM are mapped here.
	 * We have to add this region, because some systems
	 * unconditionally write to 0xb8000 (VGA RAM), and
	 * we need to make sure that vmm(4) permits accesses
	 * to it. So allocate guest memory for it.
	 */
	len = 0x100000 - LOWMEM_KB * 1024;
	vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024;
	vcp->vcp_memranges[1].vmr_size = len;
	mem_bytes -= len;

	/* Make sure that we do not place physical memory into MMIO ranges. */
	if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - 0x100000)
		len = VMM_PCI_MMIO_BAR_BASE - 0x100000;
	else
		len = mem_bytes;

	/* Third memory region: 1MB - (1MB + len) */
	vcp->vcp_memranges[2].vmr_gpa = 0x100000;
	vcp->vcp_memranges[2].vmr_size = len;
	mem_bytes -= len;

	if (mem_bytes > 0) {
		/* Fourth memory region for the remaining memory (if any) */
		vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1;
		vcp->vcp_memranges[3].vmr_size = mem_bytes;
		vcp->vcp_nmemranges = 4;
	} else
		vcp->vcp_nmemranges = 3;
}

/*
 * alloc_guest_mem
 *
 * Allocates memory for the guest.
 * Instead of doing a single allocation with one mmap(), we allocate memory
 * separately for every range for the following reasons:
 * - ASLR for the individual ranges
 * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to
 *   map the single mmap'd userspace memory to the individual guest physical
 *   memory ranges, the underlying amap of the single mmap'd range would have
 *   to allocate per-page reference counters. The reason is that the
 *   individual guest physical ranges would reference the single mmap'd region
 *   only partially. However, if every guest physical range has its own
 *   corresponding mmap'd userspace allocation, there are no partial
 *   references: every guest physical range fully references an mmap'd
 *   range => no per-page reference counters have to be allocated.
 *
 * Return values:
 *  0: success
 *  !0: failure - errno indicating the source of the failure
 */
int
alloc_guest_mem(struct vm_create_params *vcp)
{
	void *p;
	int ret;
	size_t i, j;
	struct vm_mem_range *vmr;

	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANON, -1, 0);
		if (p == MAP_FAILED) {
			ret = errno;
			for (j = 0; j < i; j++) {
				vmr = &vcp->vcp_memranges[j];
				munmap((void *)vmr->vmr_va, vmr->vmr_size);
			}

			return (ret);
		}

		vmr->vmr_va = (vaddr_t)p;
	}

	return (0);
}

/*
 * vmm_create_vm
 *
 * Requests vmm(4) to create a new VM using the supplied creation
 * parameters. This operation results in the creation of the in-kernel
 * structures for the VM, but does not start the VM's vcpu(s).
 *
 * Parameters:
 *  vcp: vm_create_params struct containing the VM's desired creation
 *      configuration
 *
 * Return values:
 *  0: success
 *  !0: ioctl to vmm(4) failed
 */
int
vmm_create_vm(struct vm_create_params *vcp)
{
	/* Sanity check arguments */
	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_nmemranges == 0 ||
	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
		return (EINVAL);

	if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
		return (EINVAL);

	if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) == -1)
		return (errno);

	return (0);
}

/*
 * init_emulated_hw
 *
 * Initializes the userspace hardware emulation
 */
void
init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
    int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
{
	struct vm_create_params *vcp = &vmc->vmc_params;
	int i;
	uint64_t memlo, memhi;

	/* Calculate memory size for NVRAM registers */
	memlo = memhi = 0;
	if (vcp->vcp_nmemranges > 2)
		memlo = vcp->vcp_memranges[2].vmr_size - 15 * 0x100000;

	if (vcp->vcp_nmemranges > 3)
		memhi = vcp->vcp_memranges[3].vmr_size;

	/* Reset the IO port map */
	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);

	/* Init i8253 PIT */
	i8253_init(vcp->vcp_id);
	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
	ioports_map[PCKBC_AUX] = vcpu_exit_i8253_misc;

	/* Init mc146818 RTC */
	mc146818_init(vcp->vcp_id, memlo, memhi);
	ioports_map[IO_RTC] = vcpu_exit_mc146818;
	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;

	/* Init master and slave PICs */
	i8259_init();
	ioports_map[IO_ICU1] = vcpu_exit_i8259;
	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
	ioports_map[IO_ICU2] = vcpu_exit_i8259;
	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
	ioports_map[ELCR0] = vcpu_exit_elcr;
	ioports_map[ELCR1] = vcpu_exit_elcr;

	/* Init ns8250 UART */
	ns8250_init(con_fd, vcp->vcp_id);
	for (i = COM1_DATA; i <= COM1_SCR; i++)
		ioports_map[i] = vcpu_exit_com;

	/* Init QEMU fw_cfg interface */
	fw_cfg_init(vmc);
	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;

	/* Initialize PCI */
	for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++)
		ioports_map[i] = vcpu_exit_pci;

	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
	pci_init();

	/* Initialize virtio devices */
	virtio_init(current_vm, child_cdrom, child_disks, child_taps);
}

/*
 * restore_emulated_hw
 *
 * Restores the userspace hardware emulation from fd
 */
void
restore_emulated_hw(struct vm_create_params *vcp, int fd,
    int *child_taps, int child_disks[][VM_MAX_BASE_PER_DISK], int child_cdrom)
{
	/* struct vm_create_params *vcp = &vmc->vmc_params; */
	int i;
	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);

	/* Init i8253 PIT */
	i8253_restore(fd, vcp->vcp_id);
	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;

	/* Init master and slave PICs */
	i8259_restore(fd);
	ioports_map[IO_ICU1] = vcpu_exit_i8259;
	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
	ioports_map[IO_ICU2] = vcpu_exit_i8259;
	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;

	/* Init ns8250 UART */
	ns8250_restore(fd, con_fd, vcp->vcp_id);
	for (i = COM1_DATA; i <= COM1_SCR; i++)
		ioports_map[i] = vcpu_exit_com;

	/* Init mc146818 RTC */
	mc146818_restore(fd, vcp->vcp_id);
	ioports_map[IO_RTC] = vcpu_exit_mc146818;
	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;

	/* Init QEMU fw_cfg interface */
	fw_cfg_restore(fd);
	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;

	/* Initialize PCI */
	for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++)
		ioports_map[i] = vcpu_exit_pci;

	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
	pci_restore(fd);
	virtio_restore(fd, current_vm, child_cdrom, child_disks, child_taps);
}

/*
 * run_vm
 *
 * Runs the VM whose creation parameters are specified in vcp
 *
 * Parameters:
 *  child_cdrom: previously-opened child ISO disk file descriptor
 *  child_disks: previously-opened child VM disk file descriptors
 *  child_taps: previously-opened child tap file descriptors
 *  vmc: vmop_create_params struct containing the VM's desired creation
 *      configuration
 *  vrs: VCPU register state to initialize
 *
 * Return values:
 *  0: the VM exited normally
 *  !0: the VM exited abnormally or failed to start
 */
int
run_vm(int child_cdrom, int child_disks[][VM_MAX_BASE_PER_DISK],
    int *child_taps, struct vmop_create_params *vmc,
    struct vcpu_reg_state *vrs)
{
	struct vm_create_params *vcp = &vmc->vmc_params;
	struct vm_rwregs_params vregsp;
	uint8_t evdone = 0;
	size_t i;
	int ret;
	pthread_t *tid, evtid;
	struct vm_run_params **vrp;
	void *exit_status;

	if (vcp == NULL)
		return (EINVAL);

	if (child_cdrom == -1 && strlen(vcp->vcp_cdrom))
		return (EINVAL);

	if (child_disks == NULL && vcp->vcp_ndisks != 0)
		return (EINVAL);

	if (child_taps == NULL && vcp->vcp_nnics != 0)
		return (EINVAL);

	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_nmemranges == 0 ||
	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
		return (EINVAL);

	tid = calloc(vcp->vcp_ncpus, sizeof(pthread_t));
	vrp = calloc(vcp->vcp_ncpus, sizeof(struct vm_run_params *));
	if (tid == NULL || vrp == NULL) {
		log_warn("%s: memory allocation error - exiting.",
		    __progname);
		return (ENOMEM);
	}

	log_debug("%s: initializing hardware for vm %s", __func__,
	    vcp->vcp_name);

	if (!(current_vm->vm_state & VM_STATE_RECEIVED))
		init_emulated_hw(vmc, child_cdrom, child_disks, child_taps);

	ret = pthread_mutex_init(&threadmutex, NULL);
	if (ret) {
		log_warn("%s: could not initialize thread state mutex",
		    __func__);
		return (ret);
	}
	ret = pthread_cond_init(&threadcond, NULL);
	if (ret) {
		log_warn("%s: could not initialize thread state "
		    "condition variable", __func__);
		return (ret);
	}

	mutex_lock(&threadmutex);

	log_debug("%s: starting vcpu threads for vm %s", __func__,
	    vcp->vcp_name);

	/*
	 * Create and launch one thread for each VCPU. These threads may
	 * migrate between PCPUs over time; the need to reload CPU state
	 * in such situations is detected and performed by vmm(4) in the
	 * kernel.
	 */
	for (i = 0; i < vcp->vcp_ncpus; i++) {
		vrp[i] = malloc(sizeof(struct vm_run_params));
		if (vrp[i] == NULL) {
			log_warn("%s: memory allocation error - "
			    "exiting.", __progname);
			/* caller will exit, so skip freeing */
			return (ENOMEM);
		}
		vrp[i]->vrp_exit = malloc(sizeof(struct vm_exit));
		if (vrp[i]->vrp_exit == NULL) {
			log_warn("%s: memory allocation error - "
			    "exiting.", __progname);
			/* caller will exit, so skip freeing */
			return (ENOMEM);
		}
		vrp[i]->vrp_vm_id = vcp->vcp_id;
		vrp[i]->vrp_vcpu_id = i;

		if (vcpu_reset(vcp->vcp_id, i, vrs)) {
			log_warnx("%s: cannot reset VCPU %zu - exiting.",
			    __progname, i);
			return (EIO);
		}

		/* once more because reset_cpu changes regs */
		if (current_vm->vm_state & VM_STATE_RECEIVED) {
			vregsp.vrwp_vm_id = vcp->vcp_id;
			vregsp.vrwp_vcpu_id = i;
			vregsp.vrwp_regs = *vrs;
			vregsp.vrwp_mask = VM_RWREGS_ALL;
			if ((ret = ioctl(env->vmd_fd, VMM_IOC_WRITEREGS,
			    &vregsp)) == -1) {
				log_warn("%s: writeregs failed", __func__);
				return (ret);
			}
		}

		ret = pthread_cond_init(&vcpu_run_cond[i], NULL);
		if (ret) {
			log_warnx("%s: cannot initialize cond var (%d)",
			    __progname, ret);
			return (ret);
		}

		ret = pthread_mutex_init(&vcpu_run_mtx[i], NULL);
		if (ret) {
			log_warnx("%s: cannot initialize mtx (%d)",
			    __progname, ret);
			return (ret);
		}

		ret = pthread_cond_init(&vcpu_unpause_cond[i], NULL);
		if (ret) {
			log_warnx("%s: cannot initialize unpause var (%d)",
			    __progname, ret);
			return (ret);
		}

		ret = pthread_mutex_init(&vcpu_unpause_mtx[i], NULL);
		if (ret) {
			log_warnx("%s: cannot initialize unpause mtx (%d)",
			    __progname, ret);
			return (ret);
		}

		vcpu_hlt[i] = 0;

		/* Start each VCPU run thread at vcpu_run_loop */
		ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]);
		if (ret) {
			/* caller will _exit after this return */
			ret = errno;
			log_warn("%s: could not create vcpu thread %zu",
			    __func__, i);
			return (ret);
		}
	}

	log_debug("%s: waiting on events for VM %s", __func__, vcp->vcp_name);
	ret = pthread_create(&evtid, NULL, event_thread, &evdone);
	if (ret) {
		errno = ret;
		log_warn("%s: could not create event thread", __func__);
		return (ret);
	}

	for (;;) {
		ret = pthread_cond_wait(&threadcond, &threadmutex);
		if (ret) {
			log_warn("%s: waiting on thread state condition "
			    "variable failed", __func__);
			return (ret);
		}

		/*
		 * Did a VCPU thread exit with an error? => return the first one
		 */
		for (i = 0; i < vcp->vcp_ncpus; i++) {
			if (vcpu_done[i] == 0)
				continue;

			if (pthread_join(tid[i], &exit_status)) {
				log_warn("%s: failed to join thread %zd - "
				    "exiting", __progname, i);
				return (EIO);
			}

			ret = (intptr_t)exit_status;
		}

		/* Did the event thread exit? => return with an error */
		if (evdone) {
			if (pthread_join(evtid, &exit_status)) {
				log_warn("%s: failed to join event thread - "
				    "exiting", __progname);
				return (EIO);
			}

			log_warnx("%s: vm %d event thread exited "
			    "unexpectedly", __progname, vcp->vcp_id);
			return (EIO);
		}

		/* Did all VCPU threads exit successfully? => return */
		for (i = 0; i < vcp->vcp_ncpus; i++) {
			if (vcpu_done[i] == 0)
				break;
		}
		if (i == vcp->vcp_ncpus)
			return (ret);

		/* Some more threads to wait for, start over */
	}

	return (ret);
}

void *
event_thread(void *arg)
{
	uint8_t *donep = arg;
	intptr_t ret;

	ret = event_dispatch();

	mutex_lock(&threadmutex);
	*donep = 1;
	pthread_cond_signal(&threadcond);
	mutex_unlock(&threadmutex);

	return (void *)ret;
}

/*
 * vcpu_run_loop
 *
 * Runs a single VCPU until vmm(4) requires help handling an exit,
 * or the VM terminates.
 *
 * Parameters:
 *  arg: vcpu_run_params for the VCPU being run by this thread
 *
 * Return values:
 *  NULL: the VCPU shutdown properly
 *  !NULL: error processing VCPU run, or the VCPU shutdown abnormally
 */
void *
vcpu_run_loop(void *arg)
{
	struct vm_run_params *vrp = (struct vm_run_params *)arg;
	intptr_t ret = 0;
	int irq;
	uint32_t n;

	vrp->vrp_continue = 0;
	n = vrp->vrp_vcpu_id;

	for (;;) {
		ret = pthread_mutex_lock(&vcpu_run_mtx[n]);

		if (ret) {
			log_warnx("%s: can't lock vcpu run mtx (%d)",
			    __func__, (int)ret);
			return ((void *)ret);
		}

		/* If we are halted and need to pause, pause */
		if (vcpu_hlt[n] && (current_vm->vm_state & VM_STATE_PAUSED)) {
			ret = pthread_barrier_wait(&vm_pause_barrier);
			if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) {
				log_warnx("%s: could not wait on pause barrier (%d)",
				    __func__, (int)ret);
				return ((void *)ret);
			}

			ret = pthread_mutex_lock(&vcpu_unpause_mtx[n]);
			if (ret) {
				log_warnx("%s: can't lock vcpu unpause mtx (%d)",
				    __func__, (int)ret);
				return ((void *)ret);
			}

			ret = pthread_cond_wait(&vcpu_unpause_cond[n],
			    &vcpu_unpause_mtx[n]);
			if (ret) {
				log_warnx(
				    "%s: can't wait on unpause cond (%d)",
				    __func__, (int)ret);
				break;
			}
			ret = pthread_mutex_unlock(&vcpu_unpause_mtx[n]);
			if (ret) {
				log_warnx("%s: can't unlock unpause mtx (%d)",
				    __func__, (int)ret);
				break;
			}
		}

		/* If we are halted and not paused, wait */
		if (vcpu_hlt[n]) {
			ret = pthread_cond_wait(&vcpu_run_cond[n],
			    &vcpu_run_mtx[n]);

			if (ret) {
				log_warnx(
				    "%s: can't wait on cond (%d)",
				    __func__, (int)ret);
				(void)pthread_mutex_unlock(
				    &vcpu_run_mtx[n]);
				break;
			}
		}

		ret = pthread_mutex_unlock(&vcpu_run_mtx[n]);

		if (ret) {
			log_warnx("%s: can't unlock mutex on cond (%d)",
			    __func__, (int)ret);
			break;
		}

		if (vrp->vrp_irqready && i8259_is_pending()) {
			irq = i8259_ack();
			vrp->vrp_irq = irq;
		} else
			vrp->vrp_irq = 0xFFFF;

		/* Still more pending? */
		if (i8259_is_pending()) {
			/*
			 * XXX can probably avoid ioctls here by providing intr
			 * in vrp
			 */
			if (vcpu_pic_intr(vrp->vrp_vm_id,
			    vrp->vrp_vcpu_id, 1)) {
				fatal("can't set INTR");
			}
		} else {
			if (vcpu_pic_intr(vrp->vrp_vm_id,
			    vrp->vrp_vcpu_id, 0)) {
				fatal("can't clear INTR");
			}
		}

		if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) == -1) {
			/* If run ioctl failed, exit */
			ret = errno;
			log_warn("%s: vm %d / vcpu %d run ioctl failed",
			    __func__, vrp->vrp_vm_id, n);
			break;
		}

		/* If the VM is terminating, exit normally */
		if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED) {
			ret = (intptr_t)NULL;
			break;
		}

		if (vrp->vrp_exit_reason != VM_EXIT_NONE) {
			/*
			 * vmm(4) needs help handling an exit, handle in
			 * vcpu_exit.
			 */
			ret = vcpu_exit(vrp);
			if (ret)
				break;
		}
	}

	mutex_lock(&threadmutex);
	vcpu_done[n] = 1;
	pthread_cond_signal(&threadcond);
	mutex_unlock(&threadmutex);

	return ((void *)ret);
}

int
vcpu_pic_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr)
{
	struct vm_intr_params vip;

	memset(&vip, 0, sizeof(vip));

	vip.vip_vm_id = vm_id;
	vip.vip_vcpu_id = vcpu_id; /* XXX always 0? */
	vip.vip_intr = intr;

	if (ioctl(env->vmd_fd, VMM_IOC_INTR, &vip) == -1)
		return (errno);

	return (0);
}

/*
 * vcpu_exit_pci
 *
 * Handle all I/O to the emulated PCI subsystem.
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 *
 * Return value:
 *  Interrupt to inject to the guest VM, or 0xFF if no interrupt should
 *  be injected.
 */
uint8_t
vcpu_exit_pci(struct vm_run_params *vrp)
{
	struct vm_exit *vei = vrp->vrp_exit;
	uint8_t intr;

	intr = 0xFF;

	switch (vei->vei.vei_port) {
	case PCI_MODE1_ADDRESS_REG:
		pci_handle_address_reg(vrp);
		break;
	case PCI_MODE1_DATA_REG:
	case PCI_MODE1_DATA_REG + 1:
	case PCI_MODE1_DATA_REG + 2:
	case PCI_MODE1_DATA_REG + 3:
		pci_handle_data_reg(vrp);
		break;
	case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END:
		intr = pci_handle_io(vrp);
		break;
	default:
		log_warnx("%s: unknown PCI register 0x%llx",
		    __progname, (uint64_t)vei->vei.vei_port);
		break;
	}

	return (intr);
}

/*
 * vcpu_exit_inout
 *
 * Handle all I/O exits that need to be emulated in vmd. This includes the
 * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device.
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 */
void
vcpu_exit_inout(struct vm_run_params *vrp)
{
	struct vm_exit *vei = vrp->vrp_exit;
	uint8_t intr = 0xFF;

	if (ioports_map[vei->vei.vei_port] != NULL)
		intr = ioports_map[vei->vei.vei_port](vrp);
	else if (vei->vei.vei_dir == VEI_DIR_IN)
		set_return_data(vei, 0xFFFFFFFF);

	if (intr != 0xFF)
		vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr);
}

/*
 * vcpu_exit_eptviolation
 *
 * handle an EPT Violation
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 *
 * Return values:
 *  0: no action required
 *  EAGAIN: a protection fault occurred, kill the vm.
 */
int
vcpu_exit_eptviolation(struct vm_run_params *vrp)
{
	struct vm_exit *ve = vrp->vrp_exit;

	/*
	 * We may be exiting to vmd to handle a pending interrupt, but
	 * the last exit type may have been VMX_EXIT_EPT_VIOLATION;
	 * check the fault_type to ensure we really are processing
	 * a VMX_EXIT_EPT_VIOLATION.
	 */
	if (ve->vee.vee_fault_type == VEE_FAULT_PROTECT) {
		log_debug("%s: EPT Violation: rip=0x%llx",
		    __progname, vrp->vrp_exit->vrs.vrs_gprs[VCPU_REGS_RIP]);
		return (EAGAIN);
	}

	return (0);
}

/*
 * vcpu_exit
 *
 * Handle a vcpu exit. This function is called when it is determined that
 * vmm(4) requires the assistance of vmd to support a particular guest
 * exit type (eg, accessing an I/O port or device). Guest state is contained
 * in 'vrp', and will be resent to vmm(4) on exit completion.
 *
 * Upon conclusion of handling the exit, the function determines if any
 * interrupts should be injected into the guest, and asserts the proper
 * IRQ line whose interrupt should be vectored.
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 *
 * Return values:
 *  0: the exit was handled successfully
 *  1: an error occurred (eg, unknown exit reason passed in 'vrp')
 */
int
vcpu_exit(struct vm_run_params *vrp)
{
	int ret;

	switch (vrp->vrp_exit_reason) {
	case VMX_EXIT_INT_WINDOW:
	case SVM_VMEXIT_VINTR:
	case VMX_EXIT_CPUID:
	case VMX_EXIT_EXTINT:
	case SVM_VMEXIT_INTR:
	case SVM_VMEXIT_NPF:
	case SVM_VMEXIT_MSR:
	case SVM_VMEXIT_CPUID:
		/*
		 * We may be exiting to vmd to handle a pending interrupt but
		 * at the same time the last exit type may have been one of
		 * these. In this case, there's nothing extra to be done
		 * here (and falling through to the default case below results
		 * in more vmd log spam).
		 */
		break;
	case VMX_EXIT_EPT_VIOLATION:
		ret = vcpu_exit_eptviolation(vrp);
		if (ret)
			return (ret);

		break;
	case VMX_EXIT_IO:
	case SVM_VMEXIT_IOIO:
		vcpu_exit_inout(vrp);
		break;
	case VMX_EXIT_HLT:
	case SVM_VMEXIT_HLT:
		ret = pthread_mutex_lock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
		if (ret) {
			log_warnx("%s: can't lock vcpu mutex (%d)",
			    __func__, ret);
			return (ret);
		}
		vcpu_hlt[vrp->vrp_vcpu_id] = 1;
		ret = pthread_mutex_unlock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
		if (ret) {
			log_warnx("%s: can't unlock vcpu mutex (%d)",
			    __func__, ret);
			return (ret);
		}
		break;
	case VMX_EXIT_TRIPLE_FAULT:
	case SVM_VMEXIT_SHUTDOWN:
		/* reset VM */
		return (EAGAIN);
	default:
		log_debug("%s: unknown exit reason 0x%x",
		    __progname, vrp->vrp_exit_reason);
	}

	vrp->vrp_continue = 1;

	return (0);
}

/*
 * find_gpa_range
 *
 * Search for a contiguous guest physical mem range.
 *
 * Parameters:
 *  vcp: VM create parameters that contain the memory map to search in
 *  gpa: the starting guest physical address
 *  len: the length of the memory range
 *
 * Return values:
 *  NULL: on failure, if there is no memory range as described by the
 *  parameters
 *  Pointer to vm_mem_range that contains the start of the range otherwise.
 */
static struct vm_mem_range *
find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len)
{
	size_t i, n;
	struct vm_mem_range *vmr;

	/* Find the first vm_mem_range that contains gpa */
	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		if (vmr->vmr_gpa + vmr->vmr_size >= gpa)
			break;
	}

	/* No range found. */
	if (i == vcp->vcp_nmemranges)
		return (NULL);

	/*
	 * vmr may cover the range [gpa, gpa + len) only partly. Make
	 * sure that the following vm_mem_ranges are contiguous and
	 * cover the rest.
	 */
	n = vmr->vmr_size - (gpa - vmr->vmr_gpa);
	if (len < n)
		len = 0;
	else
		len -= n;
	gpa = vmr->vmr_gpa + vmr->vmr_size;
	for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		if (gpa != vmr->vmr_gpa)
			return (NULL);
		if (len <= vmr->vmr_size)
			len = 0;
		else
			len -= vmr->vmr_size;

		gpa = vmr->vmr_gpa + vmr->vmr_size;
	}

	if (len != 0)
		return (NULL);

	return (vmr);
}

/*
 * write_mem
 *
 * Copies data from 'buf' into the guest VM's memory at paddr 'dst'.
 *
 * Parameters:
 *  dst: the destination paddr_t in the guest VM
 *  buf: data to copy (or NULL to zero the data)
 *  len: number of bytes to copy
 *
 * Return values:
 *  0: success
 *  EINVAL: if the guest physical memory range [dst, dst + len) does not
 *      exist in the guest.
 */
int
write_mem(paddr_t dst, const void *buf, size_t len)
{
	const char *from = buf;
	char *to;
	size_t n, off;
	struct vm_mem_range *vmr;

	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, dst, len);
	if (vmr == NULL) {
		errno = EINVAL;
		log_warn("%s: failed - invalid memory range dst = 0x%lx, "
		    "len = 0x%zx", __func__, dst, len);
		return (EINVAL);
	}

	off = dst - vmr->vmr_gpa;
	while (len != 0) {
		n = vmr->vmr_size - off;
		if (len < n)
			n = len;

		to = (char *)vmr->vmr_va + off;
		if (buf == NULL)
			memset(to, 0, n);
		else {
			memcpy(to, from, n);
			from += n;
		}
		len -= n;
		off = 0;
		vmr++;
	}

	return (0);
}

/*
 * read_mem
 *
 * Reads memory at guest paddr 'src' into 'buf'.
 *
 * Parameters:
 *  src: the source paddr_t in the guest VM to read from.
 *  buf: destination (local) buffer
 *  len: number of bytes to read
 *
 * Return values:
 *  0: success
 *  EINVAL: if the guest physical memory range [src, src + len) does not
 *      exist in the guest.
 */
int
read_mem(paddr_t src, void *buf, size_t len)
{
	char *from, *to = buf;
	size_t n, off;
	struct vm_mem_range *vmr;

	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
	if (vmr == NULL) {
		errno = EINVAL;
		log_warn("%s: failed - invalid memory range src = 0x%lx, "
		    "len = 0x%zx", __func__, src, len);
		return (EINVAL);
	}

	off = src - vmr->vmr_gpa;
	while (len != 0) {
		n = vmr->vmr_size - off;
		if (len < n)
			n = len;

		from = (char *)vmr->vmr_va + off;
		memcpy(to, from, n);

		to += n;
		len -= n;
		off = 0;
		vmr++;
	}

	return (0);
}

/*
 * vcpu_assert_pic_irq
 *
 * Injects the specified IRQ on the supplied vcpu/vm
 *
 * Parameters:
 *  vm_id: VM ID to inject to
 *  vcpu_id: VCPU ID to inject to
 *  irq: IRQ to inject
 */
void
vcpu_assert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
{
	int ret;

	i8259_assert_irq(irq);

	if (i8259_is_pending()) {
		if (vcpu_pic_intr(vm_id, vcpu_id, 1))
			fatalx("%s: can't assert INTR", __func__);

		ret = pthread_mutex_lock(&vcpu_run_mtx[vcpu_id]);
		if (ret)
			fatalx("%s: can't lock vcpu mtx (%d)", __func__, ret);

		vcpu_hlt[vcpu_id] = 0;
		ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]);
		if (ret)
			fatalx("%s: can't signal (%d)", __func__, ret);
		ret = pthread_mutex_unlock(&vcpu_run_mtx[vcpu_id]);
		if (ret)
			fatalx("%s: can't unlock vcpu mtx (%d)", __func__, ret);
	}
}

/*
 * vcpu_deassert_pic_irq
 *
 * Clears the specified IRQ on the supplied vcpu/vm
 *
 * Parameters:
 *  vm_id: VM ID to clear in
 *  vcpu_id: VCPU ID to clear in
 *  irq: IRQ to clear
 */
void
vcpu_deassert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
{
	i8259_deassert_irq(irq);

	if (!i8259_is_pending()) {
		if (vcpu_pic_intr(vm_id, vcpu_id, 0))
			fatalx("%s: can't deassert INTR for vm_id %d, "
			    "vcpu_id %d", __func__, vm_id, vcpu_id);
	}
}

/*
 * fd_hasdata
 *
 * Determines if data can be read from a file descriptor.
 *
 * Parameters:
 *  fd: the fd to check
 *
 * Return values:
 *  1 if data can be read from an fd, or 0 otherwise.
 */
int
fd_hasdata(int fd)
{
	struct pollfd pfd[1];
	int nready, hasdata = 0;

	pfd[0].fd = fd;
	pfd[0].events = POLLIN;
	nready = poll(pfd, 1, 0);
	if (nready == -1)
		log_warn("checking file descriptor for data failed");
	else if (nready == 1 && pfd[0].revents & POLLIN)
		hasdata = 1;
	return (hasdata);
}

/*
 * mutex_lock
 *
 * Wrapper function for pthread_mutex_lock that does error checking and that
 * exits on failure
 */
void
mutex_lock(pthread_mutex_t *m)
{
	int ret;

	ret = pthread_mutex_lock(m);
	if (ret) {
		errno = ret;
		fatal("could not acquire mutex");
	}
}

/*
 * mutex_unlock
 *
 * Wrapper function for pthread_mutex_unlock that does error checking and that
 * exits on failure
 */
void
mutex_unlock(pthread_mutex_t *m)
{
	int ret;

	ret = pthread_mutex_unlock(m);
	if (ret) {
		errno = ret;
		fatal("could not release mutex");
	}
}

/*
 * set_return_data
 *
 * Utility function for manipulating register data in vm exit info structs.
 * This function ensures that the data is copied to the vei->vei.vei_data
 * field with the proper size for the operation being performed.
 *
 * Parameters:
 *  vei: exit information
 *  data: return data
 */
void
set_return_data(struct vm_exit *vei, uint32_t data)
{
	switch (vei->vei.vei_size) {
	case 1:
		vei->vei.vei_data &= ~0xFF;
		vei->vei.vei_data |= (uint8_t)data;
		break;
	case 2:
		vei->vei.vei_data &= ~0xFFFF;
		vei->vei.vei_data |= (uint16_t)data;
		break;
	case 4:
		vei->vei.vei_data = data;
		break;
	}
}

/*
 * get_input_data
 *
 * Utility function for manipulating register data in vm exit info
 * structs. This function ensures that the data is copied from the
 * vei->vei.vei_data field with the proper size for the operation being
 * performed.
 *
 * Parameters:
 *  vei: exit information
 *  data: location to store the result
 */
void
get_input_data(struct vm_exit *vei, uint32_t *data)
{
	switch (vei->vei.vei_size) {
	case 1:
		*data &= 0xFFFFFF00;
		*data |= (uint8_t)vei->vei.vei_data;
		break;
	case 2:
		*data &= 0xFFFF0000;
		*data |= (uint16_t)vei->vei.vei_data;
		break;
	case 4:
		*data = vei->vei.vei_data;
		break;
	default:
		log_warnx("%s: invalid i/o size %d", __func__,
		    vei->vei.vei_size);
	}

}

/*
 * translate_gva
 *
 * Translates a guest virtual address to a guest physical address by walking
 * the currently active page table (if needed).
 *
 * Note - this function can possibly alter the supplied VCPU state.
 * Specifically, it may inject exceptions depending on the current VCPU
 * configuration, and may alter %cr2 on #PF. Consequently, this function
 * should only be used as part of instruction emulation.
 *
 * Parameters:
 *  exit: The VCPU this translation should be performed for (guest MMU settings
 *   are gathered from this VCPU)
 *  va: virtual address to translate
 *  pa: pointer to paddr_t variable that will receive the translated physical
 *   address. 'pa' is unchanged on error.
 *  mode: one of PROT_READ, PROT_WRITE, PROT_EXEC indicating the mode in which
 *   the address should be translated
 *
 * Return values:
 *  0: the address was successfully translated - 'pa' contains the physical
 *     address currently mapped by 'va'.
 *  EFAULT: the PTE for 'VA' is unmapped. A #PF will be injected in this case
 *     and %cr2 set in the vcpu structure.
 *  EINVAL: an error occurred reading paging table structures
 */
int
translate_gva(struct vm_exit *exit, uint64_t va, uint64_t *pa, int mode)
{
	int level, shift, pdidx;
	uint64_t pte, pt_paddr, pte_paddr, mask, low_mask, high_mask;
	uint64_t shift_width, pte_size;
	struct vcpu_reg_state *vrs;

	vrs = &exit->vrs;

	if (!pa)
		return (EINVAL);

	if (!(vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PG)) {
		log_debug("%s: unpaged, va=pa=0x%llx", __func__, va);
		*pa = va;
		return (0);
	}

	pt_paddr = vrs->vrs_crs[VCPU_REGS_CR3];

	log_debug("%s: guest %%cr0=0x%llx, %%cr3=0x%llx", __func__,
	    vrs->vrs_crs[VCPU_REGS_CR0], vrs->vrs_crs[VCPU_REGS_CR3]);

	if (vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PE) {
		if (vrs->vrs_crs[VCPU_REGS_CR4] & CR4_PAE) {
			pte_size = sizeof(uint64_t);
			shift_width = 9;

			if (vrs->vrs_msrs[VCPU_REGS_EFER] & EFER_LMA) {
				/* 4 level paging */
				level = 4;
				mask = L4_MASK;
				shift = L4_SHIFT;
			} else {
				/* 32 bit with PAE paging */
				level = 3;
				mask = L3_MASK;
				shift = L3_SHIFT;
			}
		} else {
			/* 32 bit paging */
			level = 2;
			shift_width = 10;
			mask = 0xFFC00000;
			shift = 22;
			pte_size = sizeof(uint32_t);
		}
	} else
		return (EINVAL);

	/* XXX: Check for R bit in segment selector and set A bit */

	for (; level > 0; level--) {
		pdidx = (va & mask) >> shift;
		pte_paddr = (pt_paddr) + (pdidx * pte_size);

		log_debug("%s: read pte level %d @ GPA 0x%llx", __func__,
		    level, pte_paddr);
		if (read_mem(pte_paddr, &pte, pte_size)) {
			log_warn("%s: failed to read pte", __func__);
			return (EFAULT);
		}

		log_debug("%s: PTE @ 0x%llx = 0x%llx", __func__, pte_paddr,
		    pte);

		/* XXX: Set CR2 */
		if (!(pte & PG_V))
			return (EFAULT);

		/* XXX: Check for SMAP */
		if ((mode == PROT_WRITE) && !(pte & PG_RW))
			return (EPERM);

		if ((exit->cpl > 0) && !(pte & PG_u))
			return (EPERM);

		pte = pte | PG_U;
		if (mode == PROT_WRITE)
			pte = pte | PG_M;
		if (write_mem(pte_paddr, &pte, pte_size)) {
			log_warn("%s: failed to write back flags to pte",
			    __func__);
			return (EIO);
		}

		/* XXX: EINVAL if in 32bit and PG_PS is 1 but CR4.PSE is 0 */
		if (pte & PG_PS)
			break;

		if (level > 1) {
			pt_paddr = pte & PG_FRAME;
			shift -= shift_width;
			mask = mask >> shift_width;
		}
	}

	low_mask = (1 << shift) - 1;
	high_mask = (((uint64_t)1ULL << ((pte_size * 8) - 1)) - 1) ^ low_mask;
	*pa = (pte & high_mask) | (va & low_mask);

	log_debug("%s: final GPA for GVA 0x%llx = 0x%llx\n", __func__, va, *pa);

	return (0);
}

/*
 * vm_pipe_init
 *
 * Initialize a vm_dev_pipe, setting up its file descriptors and its
 * event structure with the given callback.
 *
 * Parameters:
 *  p: pointer to vm_dev_pipe struct to initialize
 *  cb: callback to use for READ events on the read end of the pipe
 */
void
vm_pipe_init(struct vm_dev_pipe *p, void (*cb)(int, short, void *))
{
	int ret;
	int fds[2];

	memset(p, 0, sizeof(struct vm_dev_pipe));

	ret = pipe(fds);
	if (ret)
		fatal("failed to create vm_dev_pipe pipe");

	p->read = fds[0];
	p->write = fds[1];

	event_set(&p->read_ev, p->read, EV_READ | EV_PERSIST, cb, NULL);
}

/*
 * vm_pipe_send
 *
 * Send a message to an emulated device via the provided vm_dev_pipe.
 *
 * Parameters:
 *  p: pointer to initialized vm_dev_pipe
 *  msg: message to send in the channel
 */
void
vm_pipe_send(struct vm_dev_pipe *p, enum pipe_msg_type msg)
{
	size_t n;
	n = write(p->write, &msg, sizeof(msg));
	if (n != sizeof(msg))
		fatal("failed to write to device pipe");
}

/*
 * vm_pipe_recv
 *
 * Receive a message for an emulated device via the provided vm_dev_pipe.
 * Returns the message value, otherwise will exit on failure.
 *
 * Parameters:
 *  p: pointer to initialized vm_dev_pipe
 *
 * Return values:
 *  a value of enum pipe_msg_type or fatal exit on read(2) error
 */
enum pipe_msg_type
vm_pipe_recv(struct vm_dev_pipe *p)
{
	size_t n;
	enum pipe_msg_type msg;
	n = read(p->read, &msg, sizeof(msg));
	if (n != sizeof(msg))
		fatal("failed to read from device pipe");

	return msg;
}