/*	$OpenBSD: vm.c,v 1.58 2020/06/28 16:52:45 pd Exp $	*/

/*
 * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#include <sys/wait.h>
#include <sys/uio.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/mman.h>

#include <dev/ic/i8253reg.h>
#include <dev/isa/isareg.h>
#include <dev/pci/pcireg.h>

#include <machine/param.h>
#include <machine/psl.h>
#include <machine/pte.h>
#include <machine/specialreg.h>
#include <machine/vmmvar.h>

#include <net/if.h>

#include <errno.h>
#include <event.h>
#include <fcntl.h>
#include <imsg.h>
#include <limits.h>
#include <poll.h>
#include <pthread.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <util.h>

#include "vmd.h"
#include "vmm.h"
#include "loadfile.h"
#include "pci.h"
#include "virtio.h"
#include "proc.h"
#include "i8253.h"
#include "i8259.h"
#include "ns8250.h"
#include "mc146818.h"
#include "fw_cfg.h"
#include "atomicio.h"

io_fn_t ioports_map[MAX_PORTS];

int run_vm(int, int[][VM_MAX_BASE_PER_DISK], int *,
    struct vmop_create_params *, struct vcpu_reg_state *);
void vm_dispatch_vmm(int, short, void *);
void *event_thread(void *);
void *vcpu_run_loop(void *);
int vcpu_exit(struct vm_run_params *);
int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *);
void create_memory_map(struct vm_create_params *);
int alloc_guest_mem(struct vm_create_params *);
int vmm_create_vm(struct vm_create_params *);
void init_emulated_hw(struct vmop_create_params *, int,
    int[][VM_MAX_BASE_PER_DISK], int *);
void restore_emulated_hw(struct vm_create_params *, int, int *,
    int[][VM_MAX_BASE_PER_DISK], int);
void vcpu_exit_inout(struct vm_run_params *);
int vcpu_exit_eptviolation(struct vm_run_params *);
uint8_t vcpu_exit_pci(struct vm_run_params *);
int vcpu_pic_intr(uint32_t, uint32_t, uint8_t);
int loadfile_bios(FILE *, struct vcpu_reg_state *);
int send_vm(int, struct vm_create_params *);
int dump_send_header(int);
int dump_vmr(int, struct vm_mem_range *);
int dump_mem(int, struct vm_create_params *);
void restore_vmr(int, struct vm_mem_range *);
void restore_mem(int, struct vm_create_params *);
int restore_vm_params(int, struct vm_create_params *);
void pause_vm(struct vm_create_params *);
void unpause_vm(struct vm_create_params *);

int translate_gva(struct vm_exit *, uint64_t, uint64_t *, int);

static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t,
    size_t);

int con_fd;
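/* The VM run by this child process; assigned in start_vm(). */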
struct vmd_vm *current_vm;

extern struct vmd *env;

extern char *__progname;

pthread_mutex_t threadmutex;
pthread_cond_t threadcond;

pthread_cond_t vcpu_run_cond[VMM_MAX_VCPUS_PER_VM];
pthread_mutex_t vcpu_run_mtx[VMM_MAX_VCPUS_PER_VM];
pthread_barrier_t vm_pause_barrier;
pthread_cond_t vcpu_unpause_cond[VMM_MAX_VCPUS_PER_VM];
pthread_mutex_t vcpu_unpause_mtx[VMM_MAX_VCPUS_PER_VM];
uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM];
uint8_t vcpu_done[VMM_MAX_VCPUS_PER_VM];

/*
 * Represents a standard register set for an OS to be booted
 * as a flat 64 bit address space.
 *
 * NOT set here are:
 *  RIP
 *  RSP
 *  GDTR BASE
 *
 * Specific bootloaders should clone this structure and override
 * those fields as needed.
 *
 * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on
 * features of the CPU in use.
 */
static const struct vcpu_reg_state vcpu_init_flat64 = {
	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
	.vrs_gprs[VCPU_REGS_RIP] = 0x0,
	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
	.vrs_crs[VCPU_REGS_CR0] = CR0_ET | CR0_PE | CR0_PG,
	.vrs_crs[VCPU_REGS_CR3] = PML4_PAGE,
	.vrs_crs[VCPU_REGS_CR4] = CR4_PAE | CR4_PSE,
	.vrs_crs[VCPU_REGS_PDPTE0] = 0ULL,
	.vrs_crs[VCPU_REGS_PDPTE1] = 0ULL,
	.vrs_crs[VCPU_REGS_PDPTE2] = 0ULL,
	.vrs_crs[VCPU_REGS_PDPTE3] = 0ULL,
	.vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0},
	.vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
	.vrs_msrs[VCPU_REGS_EFER] = EFER_LME | EFER_LMA,
	.vrs_drs[VCPU_REGS_DR0] = 0x0,
	.vrs_drs[VCPU_REGS_DR1] = 0x0,
	.vrs_drs[VCPU_REGS_DR2] = 0x0,
	.vrs_drs[VCPU_REGS_DR3] = 0x0,
	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
	.vrs_drs[VCPU_REGS_DR7] = 0x400,
	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
	.vrs_msrs[VCPU_REGS_MISC_ENABLE] = 0ULL,
	.vrs_crs[VCPU_REGS_XCR0] = XCR0_X87
};

/*
 * Represents a standard register set for a BIOS to be booted
 * as a flat 16 bit address space.
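 * Each segment register initializer below is of the form
 * { selector, limit, attributes, base }.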
 */
static const struct vcpu_reg_state vcpu_init_flat16 = {
	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
	.vrs_gprs[VCPU_REGS_RIP] = 0xFFF0,
	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
	.vrs_crs[VCPU_REGS_CR0] = 0x60000010,
	.vrs_crs[VCPU_REGS_CR3] = 0,
	.vrs_sregs[VCPU_REGS_CS] = { 0xF000, 0xFFFF, 0x809F, 0xF0000},
	.vrs_sregs[VCPU_REGS_DS] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_sregs[VCPU_REGS_ES] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_sregs[VCPU_REGS_FS] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_sregs[VCPU_REGS_GS] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_sregs[VCPU_REGS_SS] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
	.vrs_msrs[VCPU_REGS_EFER] = 0ULL,
	.vrs_drs[VCPU_REGS_DR0] = 0x0,
	.vrs_drs[VCPU_REGS_DR1] = 0x0,
	.vrs_drs[VCPU_REGS_DR2] = 0x0,
	.vrs_drs[VCPU_REGS_DR3] = 0x0,
	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
	.vrs_drs[VCPU_REGS_DR7] = 0x400,
	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
	.vrs_crs[VCPU_REGS_XCR0] = XCR0_X87
};

/*
 * loadfile_bios
 *
 * As an alternative to loadfile_elf, this function loads a non-ELF BIOS image
 * directly into memory.
 *
 * Parameters:
 *  fp: file handle of the BIOS image to load
 *  (out) vrs: register state to set on init for this kernel
 *
 * Return values:
 *  0 if successful
 *  various error codes returned from read(2) or loadelf functions
 */
int
loadfile_bios(FILE *fp, struct vcpu_reg_state *vrs)
{
	off_t size, off;

	/* Set up a "flat 16 bit" register state for BIOS */
	memcpy(vrs, &vcpu_init_flat16, sizeof(*vrs));

	/* Get the size of the BIOS image and seek to the beginning */
	if (fseeko(fp, 0, SEEK_END) == -1 || (size = ftello(fp)) == -1 ||
	    fseeko(fp, 0, SEEK_SET) == -1)
		return (-1);

	/* The BIOS image must end at 1M */
	if ((off = 1048576 - size) < 0)
		return (-1);

	/* Read BIOS image into memory */
	if (mread(fp, off, size) != (size_t)size) {
		errno = EIO;
		return (-1);
	}

	log_debug("%s: loaded BIOS image", __func__);

	return (0);
}

/*
 * start_vm
 *
 * After forking a new VM process, starts the new VM with the creation
 * parameters supplied (in the incoming vm->vm_params field). This
 * function performs a basic sanity check on the incoming parameters
 * and then performs the following steps to complete the creation of the VM:
 *
 * 1. validates and creates the new VM
 * 2. opens the imsg control channel to the parent and drops more privilege
 * 3. drops additional privileges by calling pledge(2)
 * 4. loads the kernel from the disk image or file descriptor
 * 5. runs the VM's VCPU loops.
 *
 * Parameters:
 *  vm: The VM data structure, including the VM create parameters.
 *  fd: The imsg socket that is connected to the parent process.
 *
 * Return values:
 *  0: success
 *  !0 : failure - typically an errno indicating the source of the failure
 */
int
start_vm(struct vmd_vm *vm, int fd)
{
	struct vmop_create_params *vmc = &vm->vm_params;
	struct vm_create_params *vcp = &vmc->vmc_params;
	struct vcpu_reg_state vrs;
	int nicfds[VMM_MAX_NICS_PER_VM];
	int ret;
	FILE *fp;
	struct vmboot_params vmboot;
	size_t i;
	struct vm_rwregs_params vrp;

	/* Child */
	setproctitle("%s", vcp->vcp_name);
	log_procinit(vcp->vcp_name);

	if (!(vm->vm_state & VM_STATE_RECEIVED))
		create_memory_map(vcp);

	ret = alloc_guest_mem(vcp);

	if (ret) {
		errno = ret;
		fatal("could not allocate guest memory - exiting");
	}

	ret = vmm_create_vm(vcp);
	current_vm = vm;

	/* send back the kernel-generated vm id (0 on error) */
	if (write(fd, &vcp->vcp_id, sizeof(vcp->vcp_id)) !=
	    sizeof(vcp->vcp_id))
		fatal("write vcp id");

	if (ret) {
		errno = ret;
		fatal("create vmm ioctl failed - exiting");
	}

	/*
	 * pledge in the vm processes:
	 * stdio - for malloc and basic I/O including events.
	 * recvfd - for send/recv.
	 * vmm - for the vmm ioctls and operations.
	 */
	if (pledge("stdio vmm recvfd", NULL) == -1)
		fatal("pledge");

	if (vm->vm_state & VM_STATE_RECEIVED) {
		ret = read(vm->vm_receive_fd, &vrp, sizeof(vrp));
		if (ret != sizeof(vrp)) {
			fatal("received incomplete vrp - exiting");
		}
		vrs = vrp.vrwp_regs;
	} else {
		/*
		 * Set up default "flat 64 bit" register state - RIP,
		 * RSP, and GDT info will be set in bootloader
		 */
		memcpy(&vrs, &vcpu_init_flat64, sizeof(vrs));

		/* Find and open kernel image */
		if ((fp = vmboot_open(vm->vm_kernel,
		    vm->vm_disks[0], vmc->vmc_diskbases[0],
		    vmc->vmc_disktypes[0], &vmboot)) == NULL)
			fatalx("failed to open kernel - exiting");

		/* Load kernel image */
		ret = loadfile_elf(fp, vcp, &vrs,
		    vmboot.vbp_bootdev, vmboot.vbp_howto, vmc->vmc_bootdevice);

		/*
		 * Try BIOS as a fallback (only if it was provided as an image
		 * with vm->vm_kernel and not loaded from the disk)
		 */
		if (ret && errno == ENOEXEC && vm->vm_kernel != -1)
			ret = loadfile_bios(fp, &vrs);

		if (ret)
			fatal("failed to load kernel or BIOS - exiting");

		vmboot_close(fp, &vmboot);
	}

	if (vm->vm_kernel != -1)
		close(vm->vm_kernel);

	con_fd = vm->vm_tty;
	if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1)
		fatal("failed to set nonblocking mode on console");

	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
		nicfds[i] = vm->vm_ifs[i].vif_fd;

	event_init();

	if (vm->vm_state & VM_STATE_RECEIVED) {
		restore_emulated_hw(vcp, vm->vm_receive_fd, nicfds,
		    vm->vm_disks, vm->vm_cdrom);
		restore_mem(vm->vm_receive_fd, vcp);
		if (restore_vm_params(vm->vm_receive_fd, vcp))
			fatal("restore vm params failed");
		unpause_vm(vcp);
	}

	if (vmm_pipe(vm, fd, vm_dispatch_vmm) == -1)
		fatal("setup vm pipe");

	/* Execute the vcpu run loop(s) for this VM */
	ret = run_vm(vm->vm_cdrom, vm->vm_disks, nicfds, &vm->vm_params, &vrs);

	/* Ensure that any in-flight data is written back */
	virtio_shutdown(vm);

	return (ret);
}

/*
 * vm_dispatch_vmm
 *
 * imsg callback for messages that are received from the vmm parent process.
 */
void
vm_dispatch_vmm(int fd, short event, void *arg)
{
	struct vmd_vm *vm = arg;
	struct vmop_result vmr;
	struct imsgev *iev = &vm->vm_iev;
	struct imsgbuf *ibuf = &iev->ibuf;
	struct imsg imsg;
	ssize_t n;
	int verbose;

	if (event & EV_READ) {
		if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
			fatal("%s: imsg_read", __func__);
		if (n == 0)
			_exit(0);
	}

	if (event & EV_WRITE) {
		if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
			fatal("%s: msgbuf_write fd %d", __func__, ibuf->fd);
		if (n == 0)
			_exit(0);
	}

	for (;;) {
		if ((n = imsg_get(ibuf, &imsg)) == -1)
			fatal("%s: imsg_get", __func__);
		if (n == 0)
			break;

#if DEBUG > 1
		log_debug("%s: got imsg %d from %s",
		    __func__, imsg.hdr.type,
		    vm->vm_params.vmc_params.vcp_name);
#endif

		switch (imsg.hdr.type) {
		case IMSG_CTL_VERBOSE:
			IMSG_SIZE_CHECK(&imsg, &verbose);
			memcpy(&verbose, imsg.data, sizeof(verbose));
			log_setverbose(verbose);
			break;
		case IMSG_VMDOP_VM_SHUTDOWN:
			if (vmmci_ctl(VMMCI_SHUTDOWN) == -1)
				_exit(0);
			break;
		case IMSG_VMDOP_VM_REBOOT:
			if (vmmci_ctl(VMMCI_REBOOT) == -1)
				_exit(0);
			break;
		case IMSG_VMDOP_PAUSE_VM:
			vmr.vmr_result = 0;
			vmr.vmr_id = vm->vm_vmid;
			pause_vm(&vm->vm_params.vmc_params);
			imsg_compose_event(&vm->vm_iev,
			    IMSG_VMDOP_PAUSE_VM_RESPONSE,
			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
			    sizeof(vmr));
			break;
		case IMSG_VMDOP_UNPAUSE_VM:
			vmr.vmr_result = 0;
			vmr.vmr_id = vm->vm_vmid;
			unpause_vm(&vm->vm_params.vmc_params);
			imsg_compose_event(&vm->vm_iev,
			    IMSG_VMDOP_UNPAUSE_VM_RESPONSE,
			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
			    sizeof(vmr));
			break;
		case IMSG_VMDOP_SEND_VM_REQUEST:
			vmr.vmr_id = vm->vm_vmid;
			vmr.vmr_result = send_vm(imsg.fd,
			    &vm->vm_params.vmc_params);
			imsg_compose_event(&vm->vm_iev,
			    IMSG_VMDOP_SEND_VM_RESPONSE,
			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
			    sizeof(vmr));
			if (!vmr.vmr_result) {
				imsg_flush(&current_vm->vm_iev.ibuf);
				_exit(0);
			}
			break;
		default:
			fatalx("%s: got invalid imsg %d from %s",
			    __func__, imsg.hdr.type,
			    vm->vm_params.vmc_params.vcp_name);
		}
		imsg_free(&imsg);
	}
	imsg_event_add(iev);
}

/*
 * vm_shutdown
 *
 * Tell the vmm parent process to shutdown or reboot the VM and exit.
 */
__dead void
vm_shutdown(unsigned int cmd)
{
	switch (cmd) {
	case VMMCI_NONE:
	case VMMCI_SHUTDOWN:
		(void)imsg_compose_event(&current_vm->vm_iev,
		    IMSG_VMDOP_VM_SHUTDOWN, 0, 0, -1, NULL, 0);
		break;
	case VMMCI_REBOOT:
		(void)imsg_compose_event(&current_vm->vm_iev,
		    IMSG_VMDOP_VM_REBOOT, 0, 0, -1, NULL, 0);
		break;
	default:
		fatalx("invalid vm ctl command: %d", cmd);
	}
	imsg_flush(&current_vm->vm_iev.ibuf);

	_exit(0);
}

int
send_vm(int fd, struct vm_create_params *vcp)
{
	struct vm_rwregs_params vrp;
	struct vm_rwvmparams_params vpp;
	struct vmop_create_params *vmc;
	struct vm_terminate_params vtp;
	unsigned int flags = 0;
	unsigned int i;
	int ret = 0;
	size_t sz;

	if (dump_send_header(fd)) {
		log_info("%s: failed to send vm dump header", __func__);
		goto err;
	}

	pause_vm(vcp);

	vmc = calloc(1, sizeof(struct vmop_create_params));
	if (vmc == NULL) {
		log_warn("%s: calloc error getting vmc", __func__);
		ret = -1;
		goto err;
	}

	flags |= VMOP_CREATE_MEMORY;
	memcpy(&vmc->vmc_params, &current_vm->vm_params, sizeof(struct
	    vmop_create_params));
	vmc->vmc_flags = flags;
	vrp.vrwp_vm_id = vcp->vcp_id;
	vrp.vrwp_mask = VM_RWREGS_ALL;
	vpp.vpp_mask = VM_RWVMPARAMS_ALL;
	vpp.vpp_vm_id = vcp->vcp_id;

	sz = atomicio(vwrite, fd, vmc, sizeof(struct vmop_create_params));
	if (sz != sizeof(struct vmop_create_params)) {
		ret = -1;
		goto err;
	}

	for (i = 0; i < vcp->vcp_ncpus; i++) {
		vrp.vrwp_vcpu_id = i;
		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READREGS, &vrp))) {
			log_warn("%s: readregs failed", __func__);
			goto err;
		}

		sz = atomicio(vwrite, fd, &vrp,
		    sizeof(struct vm_rwregs_params));
		if (sz != sizeof(struct vm_rwregs_params)) {
			log_warn("%s: dumping registers failed", __func__);
			ret = -1;
			goto err;
		}
	}

	if ((ret = i8253_dump(fd)))
		goto err;
	if ((ret = i8259_dump(fd)))
		goto err;
	if ((ret = ns8250_dump(fd)))
		goto err;
	if ((ret = mc146818_dump(fd)))
		goto err;
	if ((ret = fw_cfg_dump(fd)))
		goto err;
	if ((ret = pci_dump(fd)))
		goto err;
	if ((ret = virtio_dump(fd)))
		goto err;
	if ((ret = dump_mem(fd, vcp)))
		goto err;

	for (i = 0; i < vcp->vcp_ncpus; i++) {
		vpp.vpp_vcpu_id = i;
		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READVMPARAMS, &vpp))) {
			log_warn("%s: readvmparams failed", __func__);
			goto err;
		}

		sz = atomicio(vwrite, fd, &vpp,
		    sizeof(struct vm_rwvmparams_params));
		if (sz != sizeof(struct vm_rwvmparams_params)) {
			log_warn("%s: dumping vm params failed", __func__);
			ret = -1;
			goto err;
		}
	}

	vtp.vtp_vm_id = vcp->vcp_id;
	if (ioctl(env->vmd_fd, VMM_IOC_TERM, &vtp) == -1) {
		log_warnx("%s: term IOC error: %d, %d", __func__,
		    errno, ENOENT);
	}
err:
	close(fd);
	if (ret)
		unpause_vm(vcp);
	return ret;
}

int
dump_send_header(int fd) {
	struct vm_dump_header vmh;
	int i;

	memcpy(&vmh.vmh_signature, VM_DUMP_SIGNATURE,
	    sizeof(vmh.vmh_signature));

	vmh.vmh_cpuids[0].code = 0x00;
	vmh.vmh_cpuids[0].leaf = 0x00;

	vmh.vmh_cpuids[1].code = 0x01;
	vmh.vmh_cpuids[1].leaf = 0x00;

	vmh.vmh_cpuids[2].code = 0x07;
	vmh.vmh_cpuids[2].leaf = 0x00;

	vmh.vmh_cpuids[3].code = 0x0d;
	vmh.vmh_cpuids[3].leaf = 0x00;

	vmh.vmh_cpuids[4].code = 0x80000001;
	vmh.vmh_cpuids[4].leaf = 0x00;

	vmh.vmh_version = VM_DUMP_VERSION;

	for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
		CPUID_LEAF(vmh.vmh_cpuids[i].code,
		    vmh.vmh_cpuids[i].leaf,
		    vmh.vmh_cpuids[i].a,
		    vmh.vmh_cpuids[i].b,
		    vmh.vmh_cpuids[i].c,
		    vmh.vmh_cpuids[i].d);
	}

	if (atomicio(vwrite, fd, &vmh, sizeof(vmh)) != sizeof(vmh))
		return (-1);

	return (0);
}

int
dump_mem(int fd, struct vm_create_params *vcp)
{
	unsigned int i;
	int ret;
	struct vm_mem_range *vmr;

	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		ret = dump_vmr(fd, vmr);
		if (ret)
			return ret;
	}
	return (0);
}

int
restore_vm_params(int fd, struct vm_create_params *vcp) {
	unsigned int i;
	struct vm_rwvmparams_params vpp;

	for (i = 0; i < vcp->vcp_ncpus; i++) {
		if (atomicio(read, fd, &vpp, sizeof(vpp)) != sizeof(vpp)) {
			log_warn("%s: error restoring vm params", __func__);
			return (-1);
		}
		vpp.vpp_vm_id = vcp->vcp_id;
		vpp.vpp_vcpu_id = i;
		if (ioctl(env->vmd_fd, VMM_IOC_WRITEVMPARAMS, &vpp) < 0) {
			log_debug("%s: writing vm params failed", __func__);
			return (-1);
		}
	}
	return (0);
}

void
restore_mem(int fd, struct vm_create_params *vcp)
{
	unsigned int i;
	struct vm_mem_range *vmr;

	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		restore_vmr(fd, vmr);
	}
}

int
dump_vmr(int fd, struct vm_mem_range *vmr)
{
	size_t rem = vmr->vmr_size, read = 0;
	char buf[PAGE_SIZE];

	while (rem > 0) {
		if (read_mem(vmr->vmr_gpa + read, buf, PAGE_SIZE)) {
			log_warn("failed to read vmr");
			return (-1);
		}
		if (atomicio(vwrite, fd, buf, sizeof(buf)) != sizeof(buf)) {
			log_warn("failed to dump vmr");
			return (-1);
		}
		rem = rem - PAGE_SIZE;
		read = read + PAGE_SIZE;
	}
	return (0);
}

void
restore_vmr(int fd, struct vm_mem_range *vmr)
{
	size_t rem = vmr->vmr_size, wrote = 0;
	char buf[PAGE_SIZE];

	while (rem > 0) {
		if (atomicio(read, fd, buf, sizeof(buf)) != sizeof(buf))
			fatal("failed to restore vmr");
		if (write_mem(vmr->vmr_gpa + wrote, buf, PAGE_SIZE))
			fatal("failed to write vmr");
		rem = rem - PAGE_SIZE;
		wrote = wrote + PAGE_SIZE;
	}
}

void
pause_vm(struct vm_create_params *vcp)
{
	unsigned int n;
	int ret;
	if (current_vm->vm_state & VM_STATE_PAUSED)
		return;

	current_vm->vm_state |= VM_STATE_PAUSED;

	ret = pthread_barrier_init(&vm_pause_barrier, NULL, vcp->vcp_ncpus + 1);
	if (ret) {
		log_warnx("%s: cannot initialize pause barrier (%d)",
		    __progname, ret);
		return;
	}

	for (n = 0; n < vcp->vcp_ncpus; n++) {
		ret = pthread_cond_broadcast(&vcpu_run_cond[n]);
		if (ret) {
			log_warnx("%s: can't broadcast vcpu run cond (%d)",
			    __func__, (int)ret);
			return;
		}
	}
	ret = pthread_barrier_wait(&vm_pause_barrier);
	if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) {
		log_warnx("%s: could not wait on pause barrier (%d)",
		    __func__, (int)ret);
		return;
	}

	ret = pthread_barrier_destroy(&vm_pause_barrier);
	if (ret) {
		log_warnx("%s: could not destroy pause barrier (%d)",
		    __progname, ret);
		return;
	}

	i8253_stop();
	mc146818_stop();
	ns8250_stop();
	virtio_stop(vcp);
}
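
/*
 * unpause_vm
 *
 * Wakes up the halted VCPU threads waiting on their unpause condition
 * variables and restarts the emulated timer, RTC, UART and virtio devices.
 */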
void
unpause_vm(struct vm_create_params *vcp)
{
	unsigned int n;
	int ret;
	if (!(current_vm->vm_state & VM_STATE_PAUSED))
		return;

	current_vm->vm_state &= ~VM_STATE_PAUSED;
	for (n = 0; n < vcp->vcp_ncpus; n++) {
		ret = pthread_cond_broadcast(&vcpu_unpause_cond[n]);
		if (ret) {
			log_warnx("%s: can't broadcast vcpu unpause cond (%d)",
			    __func__, (int)ret);
			return;
		}
	}

	i8253_start();
	mc146818_start();
	ns8250_start();
	virtio_start(vcp);
}

/*
 * vcpu_reset
 *
 * Requests vmm(4) to reset the VCPUs in the indicated VM to
 * the register state provided
 *
 * Parameters
 *  vmid: VM ID to reset
 *  vcpu_id: VCPU ID to reset
 *  vrs: the register state to initialize
 *
 * Return values:
 *  0: success
 *  !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM ID is not
 *      valid)
 */
int
vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs)
{
	struct vm_resetcpu_params vrp;

	memset(&vrp, 0, sizeof(vrp));
	vrp.vrp_vm_id = vmid;
	vrp.vrp_vcpu_id = vcpu_id;
	memcpy(&vrp.vrp_init_state, vrs, sizeof(struct vcpu_reg_state));

	log_debug("%s: resetting vcpu %d for vm %d", __func__, vcpu_id, vmid);

	if (ioctl(env->vmd_fd, VMM_IOC_RESETCPU, &vrp) == -1)
		return (errno);

	return (0);
}

/*
 * create_memory_map
 *
 * Sets up the guest physical memory ranges that the VM can access.
 *
 * Parameters:
 *  vcp: VM create parameters describing the VM whose memory map
 *      is being created
 *
 * Return values:
 *  nothing
 */
void
create_memory_map(struct vm_create_params *vcp)
{
	size_t len, mem_bytes, mem_mb;

	mem_mb = vcp->vcp_memranges[0].vmr_size;
	vcp->vcp_nmemranges = 0;
	if (mem_mb < 1 || mem_mb > VMM_MAX_VM_MEM_SIZE)
		return;

	mem_bytes = mem_mb * 1024 * 1024;

	/* First memory region: 0 - LOWMEM_KB (DOS low mem) */
	len = LOWMEM_KB * 1024;
	vcp->vcp_memranges[0].vmr_gpa = 0x0;
	vcp->vcp_memranges[0].vmr_size = len;
	mem_bytes -= len;

	/*
	 * Second memory region: LOWMEM_KB - 1MB.
	 *
	 * N.B. - Normally ROMs or parts of video RAM are mapped here.
	 * We have to add this region, because some systems
	 * unconditionally write to 0xb8000 (VGA RAM), and
	 * we need to make sure that vmm(4) permits accesses
	 * to it. So allocate guest memory for it.
	 */
	len = 0x100000 - LOWMEM_KB * 1024;
	vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024;
	vcp->vcp_memranges[1].vmr_size = len;
	mem_bytes -= len;

	/* Make sure that we do not place physical memory into MMIO ranges. */
	if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - 0x100000)
		len = VMM_PCI_MMIO_BAR_BASE - 0x100000;
	else
		len = mem_bytes;

	/* Third memory region: 1MB - (1MB + len) */
	vcp->vcp_memranges[2].vmr_gpa = 0x100000;
	vcp->vcp_memranges[2].vmr_size = len;
	mem_bytes -= len;

	if (mem_bytes > 0) {
		/* Fourth memory region for the remaining memory (if any) */
		vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1;
		vcp->vcp_memranges[3].vmr_size = mem_bytes;
		vcp->vcp_nmemranges = 4;
	} else
		vcp->vcp_nmemranges = 3;
}

/*
 * alloc_guest_mem
 *
 * Allocates memory for the guest.
 * Instead of doing a single allocation with one mmap(), we allocate memory
 * separately for every range for the following reasons:
 * - ASLR for the individual ranges
 * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to
 *   map the single mmap'd userspace memory to the individual guest physical
 *   memory ranges, the underlying amap of the single mmap'd range would have
 *   to allocate per-page reference counters. The reason is that the
 *   individual guest physical ranges would reference the single mmap'd region
 *   only partially. However, if every guest physical range has its own
 *   corresponding mmap'd userspace allocation, there are no partial
 *   references: every guest physical range fully references an mmap'd
 *   range => no per-page reference counters have to be allocated.
 *
 * Return values:
 *  0: success
 *  !0: failure - errno indicating the source of the failure
 */
int
alloc_guest_mem(struct vm_create_params *vcp)
{
	void *p;
	int ret;
	size_t i, j;
	struct vm_mem_range *vmr;

	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANON, -1, 0);
		if (p == MAP_FAILED) {
			ret = errno;
			for (j = 0; j < i; j++) {
				vmr = &vcp->vcp_memranges[j];
				munmap((void *)vmr->vmr_va, vmr->vmr_size);
			}

			return (ret);
		}

		vmr->vmr_va = (vaddr_t)p;
	}

	return (0);
}

/*
 * vmm_create_vm
 *
 * Requests vmm(4) to create a new VM using the supplied creation
 * parameters. This operation results in the creation of the in-kernel
 * structures for the VM, but does not start the VM's vcpu(s).
 *
 * Parameters:
 *  vcp: vm_create_params struct containing the VM's desired creation
 *      configuration
 *
 * Return values:
 *  0: success
 *  !0 : ioctl to vmm(4) failed
 */
int
vmm_create_vm(struct vm_create_params *vcp)
{
	/* Sanity check arguments */
	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_nmemranges == 0 ||
	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
		return (EINVAL);

	if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
		return (EINVAL);

	if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) == -1)
		return (errno);

	return (0);
}

/*
 * init_emulated_hw
 *
 * Initializes the userspace hardware emulation
 */
void
init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
    int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
{
	struct vm_create_params *vcp = &vmc->vmc_params;
	int i;
	uint64_t memlo, memhi;

	/* Calculate memory size for NVRAM registers */
	memlo = memhi = 0;
	if (vcp->vcp_nmemranges > 2)
		memlo = vcp->vcp_memranges[2].vmr_size - 15 * 0x100000;

	if (vcp->vcp_nmemranges > 3)
		memhi = vcp->vcp_memranges[3].vmr_size;

	/* Reset the IO port map */
	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);

	/* Init i8253 PIT */
	i8253_init(vcp->vcp_id);
	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
	ioports_map[PCKBC_AUX] = vcpu_exit_i8253_misc;

	/* Init mc146818 RTC */
	mc146818_init(vcp->vcp_id, memlo, memhi);
	ioports_map[IO_RTC] = vcpu_exit_mc146818;
	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;

	/* Init master and slave PICs */
	i8259_init();
	ioports_map[IO_ICU1] = vcpu_exit_i8259;
	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
	ioports_map[IO_ICU2] = vcpu_exit_i8259;
	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
	ioports_map[ELCR0] = vcpu_exit_elcr;
	ioports_map[ELCR1] = vcpu_exit_elcr;

	/* Init ns8250 UART */
	ns8250_init(con_fd, vcp->vcp_id);
	for (i = COM1_DATA; i <= COM1_SCR; i++)
		ioports_map[i] = vcpu_exit_com;

	/* Init QEMU fw_cfg interface */
	fw_cfg_init(vmc);
	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;

	/* Initialize PCI */
	for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++)
		ioports_map[i] = vcpu_exit_pci;

	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
	pci_init();

	/* Initialize virtio devices */
	virtio_init(current_vm, child_cdrom, child_disks, child_taps);
}
/*
 * restore_emulated_hw
 *
 * Restores the userspace hardware emulation from fd
 */
void
restore_emulated_hw(struct vm_create_params *vcp, int fd,
    int *child_taps, int child_disks[][VM_MAX_BASE_PER_DISK], int child_cdrom)
{
	/* struct vm_create_params *vcp = &vmc->vmc_params; */
	int i;
	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);

	/* Init i8253 PIT */
	i8253_restore(fd, vcp->vcp_id);
	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;

	/* Init master and slave PICs */
	i8259_restore(fd);
	ioports_map[IO_ICU1] = vcpu_exit_i8259;
	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
	ioports_map[IO_ICU2] = vcpu_exit_i8259;
	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;

	/* Init ns8250 UART */
	ns8250_restore(fd, con_fd, vcp->vcp_id);
	for (i = COM1_DATA; i <= COM1_SCR; i++)
		ioports_map[i] = vcpu_exit_com;

	/* Init mc146818 RTC */
	mc146818_restore(fd, vcp->vcp_id);
	ioports_map[IO_RTC] = vcpu_exit_mc146818;
	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;

	/* Init QEMU fw_cfg interface */
	fw_cfg_restore(fd);
	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;

	/* Initialize PCI */
	for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++)
		ioports_map[i] = vcpu_exit_pci;

	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
	pci_restore(fd);
	virtio_restore(fd, current_vm, child_cdrom, child_disks, child_taps);
}

/*
 * run_vm
 *
 * Runs the VM whose creation parameters are specified in vcp
 *
 * Parameters:
 *  child_cdrom: previously-opened child ISO disk file descriptor
 *  child_disks: previously-opened child VM disk file descriptors
 *  child_taps: previously-opened child tap file descriptors
 *  vmc: vmop_create_params struct containing the VM's desired creation
 *      configuration
 *  vrs: VCPU register state to initialize
 *
 * Return values:
 *  0: the VM exited normally
 *  !0 : the VM exited abnormally or failed to start
 */
int
run_vm(int child_cdrom, int child_disks[][VM_MAX_BASE_PER_DISK],
    int *child_taps, struct vmop_create_params *vmc,
    struct vcpu_reg_state *vrs)
{
	struct vm_create_params *vcp = &vmc->vmc_params;
	struct vm_rwregs_params vregsp;
	uint8_t evdone = 0;
	size_t i;
	int ret;
	pthread_t *tid, evtid;
	struct vm_run_params **vrp;
	void *exit_status;

	if (vcp == NULL)
		return (EINVAL);

	if (child_cdrom == -1 && strlen(vcp->vcp_cdrom))
		return (EINVAL);

	if (child_disks == NULL && vcp->vcp_ndisks != 0)
		return (EINVAL);

	if (child_taps == NULL && vcp->vcp_nnics != 0)
		return (EINVAL);

	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_nmemranges == 0 ||
	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
		return (EINVAL);

	tid = calloc(vcp->vcp_ncpus, sizeof(pthread_t));
	vrp = calloc(vcp->vcp_ncpus, sizeof(struct vm_run_params *));
	if (tid == NULL || vrp == NULL) {
		log_warn("%s: memory allocation error - exiting.",
		    __progname);
		return (ENOMEM);
	}

	log_debug("%s: initializing hardware for vm %s", __func__,
	    vcp->vcp_name);

	if (!(current_vm->vm_state & VM_STATE_RECEIVED))
		init_emulated_hw(vmc, child_cdrom, child_disks, child_taps);

	ret = pthread_mutex_init(&threadmutex, NULL);
	if (ret) {
		log_warn("%s: could not initialize thread state mutex",
		    __func__);
		return (ret);
	}
	ret = pthread_cond_init(&threadcond, NULL);
	if (ret) {
		log_warn("%s: could not initialize thread state "
		    "condition variable", __func__);
		return (ret);
	}

	mutex_lock(&threadmutex);

	log_debug("%s: starting vcpu threads for vm %s", __func__,
	    vcp->vcp_name);

	/*
	 * Create and launch one thread for each VCPU. These threads may
	 * migrate between PCPUs over time; the need to reload CPU state
	 * in such situations is detected and performed by vmm(4) in the
	 * kernel.
	 */
	for (i = 0 ; i < vcp->vcp_ncpus; i++) {
		vrp[i] = malloc(sizeof(struct vm_run_params));
		if (vrp[i] == NULL) {
			log_warn("%s: memory allocation error - "
			    "exiting.", __progname);
			/* caller will exit, so skip freeing */
			return (ENOMEM);
		}
		vrp[i]->vrp_exit = malloc(sizeof(struct vm_exit));
		if (vrp[i]->vrp_exit == NULL) {
			log_warn("%s: memory allocation error - "
			    "exiting.", __progname);
			/* caller will exit, so skip freeing */
			return (ENOMEM);
		}
		vrp[i]->vrp_vm_id = vcp->vcp_id;
		vrp[i]->vrp_vcpu_id = i;

		if (vcpu_reset(vcp->vcp_id, i, vrs)) {
			log_warnx("%s: cannot reset VCPU %zu - exiting.",
			    __progname, i);
			return (EIO);
		}

		/* once more because reset_cpu changes regs */
		if (current_vm->vm_state & VM_STATE_RECEIVED) {
			vregsp.vrwp_vm_id = vcp->vcp_id;
			vregsp.vrwp_vcpu_id = i;
			vregsp.vrwp_regs = *vrs;
			vregsp.vrwp_mask = VM_RWREGS_ALL;
			if ((ret = ioctl(env->vmd_fd, VMM_IOC_WRITEREGS,
			    &vregsp)) == -1) {
				log_warn("%s: writeregs failed", __func__);
				return (ret);
			}
		}

		ret = pthread_cond_init(&vcpu_run_cond[i], NULL);
		if (ret) {
			log_warnx("%s: cannot initialize cond var (%d)",
			    __progname, ret);
			return (ret);
		}

		ret = pthread_mutex_init(&vcpu_run_mtx[i], NULL);
		if (ret) {
			log_warnx("%s: cannot initialize mtx (%d)",
			    __progname, ret);
			return (ret);
		}

		ret = pthread_cond_init(&vcpu_unpause_cond[i], NULL);
		if (ret) {
			log_warnx("%s: cannot initialize unpause var (%d)",
			    __progname, ret);
			return (ret);
		}

		ret = pthread_mutex_init(&vcpu_unpause_mtx[i], NULL);
		if (ret) {
			log_warnx("%s: cannot initialize unpause mtx (%d)",
			    __progname, ret);
			return (ret);
		}

		vcpu_hlt[i] = 0;

		/* Start each VCPU run thread at vcpu_run_loop */
		ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]);
		if (ret) {
			/* caller will _exit after this return */
			ret = errno;
			log_warn("%s: could not create vcpu thread %zu",
			    __func__, i);
			return (ret);
		}
	}

	log_debug("%s: waiting on events for VM %s", __func__, vcp->vcp_name);
	ret = pthread_create(&evtid, NULL, event_thread, &evdone);
	if (ret) {
		errno = ret;
		log_warn("%s: could not create event thread", __func__);
		return (ret);
	}

	for (;;) {
		ret = pthread_cond_wait(&threadcond, &threadmutex);
		if (ret) {
			log_warn("%s: waiting on thread state condition "
			    "variable failed", __func__);
			return (ret);
		}

		/*
		 * Did a VCPU thread exit with an error? => return the first one
		 */
		for (i = 0; i < vcp->vcp_ncpus; i++) {
			if (vcpu_done[i] == 0)
				continue;

			if (pthread_join(tid[i], &exit_status)) {
				log_warn("%s: failed to join thread %zd - "
				    "exiting", __progname, i);
				return (EIO);
			}

			ret = (intptr_t)exit_status;
		}

		/* Did the event thread exit? => return with an error */
		if (evdone) {
			if (pthread_join(evtid, &exit_status)) {
				log_warn("%s: failed to join event thread - "
				    "exiting", __progname);
				return (EIO);
			}

			log_warnx("%s: vm %d event thread exited "
			    "unexpectedly", __progname, vcp->vcp_id);
			return (EIO);
		}

		/* Did all VCPU threads exit successfully? => return */
		for (i = 0; i < vcp->vcp_ncpus; i++) {
			if (vcpu_done[i] == 0)
				break;
		}
		if (i == vcp->vcp_ncpus)
			return (ret);

		/* Some more threads to wait for, start over */
	}

	return (ret);
}

void *
event_thread(void *arg)
{
	uint8_t *donep = arg;
	intptr_t ret;

	ret = event_dispatch();

	mutex_lock(&threadmutex);
	*donep = 1;
	pthread_cond_signal(&threadcond);
	mutex_unlock(&threadmutex);

	return (void *)ret;
}

/*
 * vcpu_run_loop
 *
 * Runs a single VCPU until vmm(4) requires help handling an exit,
 * or the VM terminates.
 *
 * Parameters:
 *  arg: vcpu_run_params for the VCPU being run by this thread
 *
 * Return values:
 *  NULL: the VCPU shutdown properly
 *  !NULL: error processing VCPU run, or the VCPU shutdown abnormally
 */
void *
vcpu_run_loop(void *arg)
{
	struct vm_run_params *vrp = (struct vm_run_params *)arg;
	intptr_t ret = 0;
	int irq;
	uint32_t n;

	vrp->vrp_continue = 0;
	n = vrp->vrp_vcpu_id;

	for (;;) {
		ret = pthread_mutex_lock(&vcpu_run_mtx[n]);

		if (ret) {
			log_warnx("%s: can't lock vcpu run mtx (%d)",
			    __func__, (int)ret);
			return ((void *)ret);
		}

		/* If we are halted and need to pause, pause */
		if (vcpu_hlt[n] && (current_vm->vm_state & VM_STATE_PAUSED)) {
			ret = pthread_barrier_wait(&vm_pause_barrier);
			if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) {
				log_warnx("%s: could not wait on pause barrier (%d)",
				    __func__, (int)ret);
				return ((void *)ret);
			}

			ret = pthread_mutex_lock(&vcpu_unpause_mtx[n]);
			if (ret) {
				log_warnx("%s: can't lock vcpu unpause mtx (%d)",
				    __func__, (int)ret);
				return ((void *)ret);
			}

			ret = pthread_cond_wait(&vcpu_unpause_cond[n],
			    &vcpu_unpause_mtx[n]);
			if (ret) {
				log_warnx(
				    "%s: can't wait on unpause cond (%d)",
				    __func__, (int)ret);
				break;
			}
			ret = pthread_mutex_unlock(&vcpu_unpause_mtx[n]);
			if (ret) {
				log_warnx("%s: can't unlock unpause mtx (%d)",
				    __func__, (int)ret);
				break;
			}
		}

		/* If we are halted and not paused, wait */
		if (vcpu_hlt[n]) {
			ret = pthread_cond_wait(&vcpu_run_cond[n],
			    &vcpu_run_mtx[n]);

			if (ret) {
				log_warnx(
				    "%s: can't wait on cond (%d)",
				    __func__, (int)ret);
				(void)pthread_mutex_unlock(
				    &vcpu_run_mtx[n]);
				break;
			}
		}

		ret = pthread_mutex_unlock(&vcpu_run_mtx[n]);

		if (ret) {
			log_warnx("%s: can't unlock mutex on cond (%d)",
			    __func__, (int)ret);
			break;
		}

		if (vrp->vrp_irqready && i8259_is_pending()) {
			irq = i8259_ack();
			vrp->vrp_irq = irq;
		} else
			vrp->vrp_irq = 0xFFFF;

		/* Still more pending? */
		if (i8259_is_pending()) {
			/* XXX can probably avoid ioctls here by providing intr in vrp */
			if (vcpu_pic_intr(vrp->vrp_vm_id,
			    vrp->vrp_vcpu_id, 1)) {
				fatal("can't set INTR");
			}
		} else {
			if (vcpu_pic_intr(vrp->vrp_vm_id,
			    vrp->vrp_vcpu_id, 0)) {
				fatal("can't clear INTR");
			}
		}

		if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) == -1) {
			/* If run ioctl failed, exit */
			ret = errno;
			log_warn("%s: vm %d / vcpu %d run ioctl failed",
			    __func__, vrp->vrp_vm_id, n);
			break;
		}

		/* If the VM is terminating, exit normally */
		if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED) {
			ret = (intptr_t)NULL;
			break;
		}

		if (vrp->vrp_exit_reason != VM_EXIT_NONE) {
			/*
			 * vmm(4) needs help handling an exit, handle in
			 * vcpu_exit.
			 */
			ret = vcpu_exit(vrp);
			if (ret)
				break;
		}
	}

	mutex_lock(&threadmutex);
	vcpu_done[n] = 1;
	pthread_cond_signal(&threadcond);
	mutex_unlock(&threadmutex);

	return ((void *)ret);
}

int
vcpu_pic_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr)
{
	struct vm_intr_params vip;

	memset(&vip, 0, sizeof(vip));

	vip.vip_vm_id = vm_id;
	vip.vip_vcpu_id = vcpu_id; /* XXX always 0? */
	vip.vip_intr = intr;

	if (ioctl(env->vmd_fd, VMM_IOC_INTR, &vip) == -1)
		return (errno);

	return (0);
}

/*
 * vcpu_exit_pci
 *
 * Handle all I/O to the emulated PCI subsystem.
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 *
 * Return value:
 *  Interrupt to inject to the guest VM, or 0xFF if no interrupt should
 *      be injected.
 */
uint8_t
vcpu_exit_pci(struct vm_run_params *vrp)
{
	struct vm_exit *vei = vrp->vrp_exit;
	uint8_t intr;

	intr = 0xFF;

	switch (vei->vei.vei_port) {
	case PCI_MODE1_ADDRESS_REG:
		pci_handle_address_reg(vrp);
		break;
	case PCI_MODE1_DATA_REG:
	case PCI_MODE1_DATA_REG + 1:
	case PCI_MODE1_DATA_REG + 2:
	case PCI_MODE1_DATA_REG + 3:
		pci_handle_data_reg(vrp);
		break;
	case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END:
		intr = pci_handle_io(vrp);
		break;
	default:
		log_warnx("%s: unknown PCI register 0x%llx",
		    __progname, (uint64_t)vei->vei.vei_port);
		break;
	}

	return (intr);
}

/*
 * vcpu_exit_inout
 *
 * Handle all I/O exits that need to be emulated in vmd. This includes the
 * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device.
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 */
void
vcpu_exit_inout(struct vm_run_params *vrp)
{
	struct vm_exit *vei = vrp->vrp_exit;
	uint8_t intr = 0xFF;

	if (ioports_map[vei->vei.vei_port] != NULL)
		intr = ioports_map[vei->vei.vei_port](vrp);
	else if (vei->vei.vei_dir == VEI_DIR_IN)
		set_return_data(vei, 0xFFFFFFFF);

	if (intr != 0xFF)
		vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr);
}

/*
 * vcpu_exit_eptviolation
 *
 * handle an EPT Violation
 *
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 *
 * Return values:
 *  0: no action required
 *  EAGAIN: a protection fault occurred, kill the vm.
 */
int
vcpu_exit_eptviolation(struct vm_run_params *vrp)
{
	struct vm_exit *ve = vrp->vrp_exit;
	/*
	 * vmm(4) may be exiting to vmd to handle a pending interrupt,
	 * but the last exit type may have been VMX_EXIT_EPT_VIOLATION,
	 * so check the fault_type to ensure we really are processing
	 * a VMX_EXIT_EPT_VIOLATION.
	 */
	if (ve->vee.vee_fault_type == VEE_FAULT_PROTECT) {
		log_debug("%s: EPT Violation: rip=0x%llx",
		    __progname, vrp->vrp_exit->vrs.vrs_gprs[VCPU_REGS_RIP]);
		return (EAGAIN);
	}

	return (0);
}

/*
 * vcpu_exit
 *
 * Handle a vcpu exit. This function is called when it is determined that
 * vmm(4) requires the assistance of vmd to support a particular guest
 * exit type (eg, accessing an I/O port or device). Guest state is contained
 * in 'vrp', and will be resent to vmm(4) on exit completion.
 *
 * Upon conclusion of handling the exit, the function determines if any
 * interrupts should be injected into the guest, and asserts the proper
 * IRQ line whose interrupt should be vectored.
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 *
 * Return values:
 *  0: the exit was handled successfully
 *  1: an error occurred (eg, unknown exit reason passed in 'vrp')
 */
int
vcpu_exit(struct vm_run_params *vrp)
{
	int ret;

	switch (vrp->vrp_exit_reason) {
	case VMX_EXIT_INT_WINDOW:
	case SVM_VMEXIT_VINTR:
	case VMX_EXIT_CPUID:
	case VMX_EXIT_EXTINT:
	case SVM_VMEXIT_INTR:
	case SVM_VMEXIT_NPF:
	case SVM_VMEXIT_MSR:
	case SVM_VMEXIT_CPUID:
		/*
		 * We may be exiting to vmd to handle a pending interrupt but
		 * at the same time the last exit type may have been one of
		 * these. In this case, there's nothing extra to be done
		 * here (and falling through to the default case below results
		 * in more vmd log spam).
		 */
		break;
	case VMX_EXIT_EPT_VIOLATION:
		ret = vcpu_exit_eptviolation(vrp);
		if (ret)
			return (ret);

		break;
	case VMX_EXIT_IO:
	case SVM_VMEXIT_IOIO:
		vcpu_exit_inout(vrp);
		break;
	case VMX_EXIT_HLT:
	case SVM_VMEXIT_HLT:
		ret = pthread_mutex_lock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
		if (ret) {
			log_warnx("%s: can't lock vcpu mutex (%d)",
			    __func__, ret);
			return (ret);
		}
		vcpu_hlt[vrp->vrp_vcpu_id] = 1;
		ret = pthread_mutex_unlock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
		if (ret) {
			log_warnx("%s: can't unlock vcpu mutex (%d)",
			    __func__, ret);
			return (ret);
		}
		break;
	case VMX_EXIT_TRIPLE_FAULT:
	case SVM_VMEXIT_SHUTDOWN:
		/* reset VM */
		return (EAGAIN);
	default:
		log_debug("%s: unknown exit reason 0x%x",
		    __progname, vrp->vrp_exit_reason);
	}

	/* Process any pending traffic */
	vionet_process_rx(vrp->vrp_vm_id);

	vrp->vrp_continue = 1;

	return (0);
}

/*
 * find_gpa_range
 *
 * Search for a contiguous guest physical mem range.
 *
 * Parameters:
 *  vcp: VM create parameters that contain the memory map to search in
 *  gpa: the starting guest physical address
 *  len: the length of the memory range
 *
 * Return values:
 *  NULL: on failure if there is no memory range as described by the parameters
 *  Pointer to vm_mem_range that contains the start of the range otherwise.
 */
static struct vm_mem_range *
find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len)
{
	size_t i, n;
	struct vm_mem_range *vmr;

	/* Find the first vm_mem_range that contains gpa */
	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		if (vmr->vmr_gpa + vmr->vmr_size >= gpa)
			break;
	}

	/* No range found. */
	if (i == vcp->vcp_nmemranges)
		return (NULL);

	/*
	 * vmr may cover the range [gpa, gpa + len) only partly. Make
	 * sure that the following vm_mem_ranges are contiguous and
	 * cover the rest.
	 */
	n = vmr->vmr_size - (gpa - vmr->vmr_gpa);
	if (len < n)
		len = 0;
	else
		len -= n;
	gpa = vmr->vmr_gpa + vmr->vmr_size;
	for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		if (gpa != vmr->vmr_gpa)
			return (NULL);
		if (len <= vmr->vmr_size)
			len = 0;
		else
			len -= vmr->vmr_size;

		gpa = vmr->vmr_gpa + vmr->vmr_size;
	}

	if (len != 0)
		return (NULL);

	return (vmr);
}

void *
vaddr_mem(paddr_t gpa, size_t len)
{
	struct vm_create_params *vcp = &current_vm->vm_params.vmc_params;
	size_t i;
	struct vm_mem_range *vmr;
	paddr_t gpend = gpa + len;

	/* Find the first vm_mem_range that contains gpa */
	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		if (gpa < vmr->vmr_gpa)
			continue;

		if (gpend >= vmr->vmr_gpa + vmr->vmr_size)
			continue;

		return ((char *)vmr->vmr_va + (gpa - vmr->vmr_gpa));
	}

	return (NULL);
}

/*
 * write_mem
 *
 * Copies data from 'buf' into the guest VM's memory at paddr 'dst'.
 *
 * Parameters:
 *  dst: the destination paddr_t in the guest VM
 *  buf: data to copy (or NULL to zero the data)
 *  len: number of bytes to copy
 *
 * Return values:
 *  0: success
 *  EINVAL: if the guest physical memory range [dst, dst + len) does not
 *      exist in the guest.
 */
int
write_mem(paddr_t dst, const void *buf, size_t len)
{
	const char *from = buf;
	char *to;
	size_t n, off;
	struct vm_mem_range *vmr;

	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, dst, len);
	if (vmr == NULL) {
		errno = EINVAL;
		log_warn("%s: failed - invalid memory range dst = 0x%lx, "
		    "len = 0x%zx", __func__, dst, len);
		return (EINVAL);
	}

	off = dst - vmr->vmr_gpa;
	while (len != 0) {
		n = vmr->vmr_size - off;
		if (len < n)
			n = len;

		to = (char *)vmr->vmr_va + off;
		if (buf == NULL)
			memset(to, 0, n);
		else {
			memcpy(to, from, n);
			from += n;
		}
		len -= n;
		off = 0;
		vmr++;
	}

	return (0);
}

/*
 * read_mem
 *
 * Reads memory at guest paddr 'src' into 'buf'.
 *
 * Parameters:
 *  src: the source paddr_t in the guest VM to read from.
 *  buf: destination (local) buffer
 *  len: number of bytes to read
 *
 * Return values:
 *  0: success
 *  EINVAL: if the guest physical memory range [src, src + len) does not
 *      exist in the guest.
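 *
 * Like write_mem(), the copied region may span multiple contiguous
 * vm_mem_ranges.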
 */
int
read_mem(paddr_t src, void *buf, size_t len)
{
	char *from, *to = buf;
	size_t n, off;
	struct vm_mem_range *vmr;

	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
	if (vmr == NULL) {
		errno = EINVAL;
		log_warn("%s: failed - invalid memory range src = 0x%lx, "
		    "len = 0x%zx", __func__, src, len);
		return (EINVAL);
	}

	off = src - vmr->vmr_gpa;
	while (len != 0) {
		n = vmr->vmr_size - off;
		if (len < n)
			n = len;

		from = (char *)vmr->vmr_va + off;
		memcpy(to, from, n);

		to += n;
		len -= n;
		off = 0;
		vmr++;
	}

	return (0);
}

int
iovec_mem(paddr_t src, size_t len, struct iovec *iov, int iovcnt)
{
	size_t n, off;
	struct vm_mem_range *vmr;
	int niov = 0;

	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
	if (vmr == NULL) {
		errno = EINVAL;
		return (-1);
	}

	off = src - vmr->vmr_gpa;
	while (len > 0) {
		if (niov == iovcnt) {
			errno = ENOMEM;
			return (-1);
		}

		n = vmr->vmr_size - off;
		if (len < n)
			n = len;

		iov[niov].iov_base = (char *)vmr->vmr_va + off;
		iov[niov].iov_len = n;

		niov++;

		len -= n;
		off = 0;
		vmr++;
	}

	return (niov);
}

/*
 * vcpu_assert_pic_irq
 *
 * Injects the specified IRQ on the supplied vcpu/vm
 *
 * Parameters:
 *  vm_id: VM ID to inject to
 *  vcpu_id: VCPU ID to inject to
 *  irq: IRQ to inject
 */
void
vcpu_assert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
{
	int ret;

	i8259_assert_irq(irq);

	if (i8259_is_pending()) {
		if (vcpu_pic_intr(vm_id, vcpu_id, 1))
			fatalx("%s: can't assert INTR", __func__);

		ret = pthread_mutex_lock(&vcpu_run_mtx[vcpu_id]);
		if (ret)
			fatalx("%s: can't lock vcpu mtx (%d)", __func__, ret);

		vcpu_hlt[vcpu_id] = 0;
		ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]);
		if (ret)
			fatalx("%s: can't signal (%d)", __func__, ret);
		ret = pthread_mutex_unlock(&vcpu_run_mtx[vcpu_id]);
		if (ret)
			fatalx("%s: can't unlock vcpu mtx (%d)", __func__, ret);
	}
}

/*
 * vcpu_deassert_pic_irq
 *
 * Clears the specified IRQ on the supplied vcpu/vm
 *
 * Parameters:
 *  vm_id: VM ID to clear in
 *  vcpu_id: VCPU ID to clear in
 *  irq: IRQ to clear
 */
void
vcpu_deassert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
{
	i8259_deassert_irq(irq);

	if (!i8259_is_pending()) {
		if (vcpu_pic_intr(vm_id, vcpu_id, 0))
			fatalx("%s: can't deassert INTR for vm_id %d, "
			    "vcpu_id %d", __func__, vm_id, vcpu_id);
	}
}

/*
 * fd_hasdata
 *
 * Determines if data can be read from a file descriptor.
 *
 * Parameters:
 *  fd: the fd to check
 *
 * Return values:
 *  1 if data can be read from an fd, or 0 otherwise.
 */
int
fd_hasdata(int fd)
{
	struct pollfd pfd[1];
	int nready, hasdata = 0;

	pfd[0].fd = fd;
	pfd[0].events = POLLIN;
	nready = poll(pfd, 1, 0);
	if (nready == -1)
		log_warn("checking file descriptor for data failed");
	else if (nready == 1 && pfd[0].revents & POLLIN)
		hasdata = 1;
	return (hasdata);
}

/*
 * mutex_lock
 *
 * Wrapper function for pthread_mutex_lock that does error checking and that
 * exits on failure
 */
void
mutex_lock(pthread_mutex_t *m)
{
	int ret;

	ret = pthread_mutex_lock(m);
	if (ret) {
		errno = ret;
		fatal("could not acquire mutex");
	}
}

/*
 * mutex_unlock
 *
 * Wrapper function for pthread_mutex_unlock that does error checking and that
 * exits on failure
 */
void
mutex_unlock(pthread_mutex_t *m)
{
	int ret;

	ret = pthread_mutex_unlock(m);
	if (ret) {
		errno = ret;
		fatal("could not release mutex");
	}
}

/*
 * set_return_data
 *
 * Utility function for manipulating register data in vm exit info structs. This
 * function ensures that the data is copied to the vei->vei.vei_data field with
 * the proper size for the operation being performed.
 *
 * Parameters:
 *  vei: exit information
 *  data: return data
 */
void
set_return_data(struct vm_exit *vei, uint32_t data)
{
	switch (vei->vei.vei_size) {
	case 1:
		vei->vei.vei_data &= ~0xFF;
		vei->vei.vei_data |= (uint8_t)data;
		break;
	case 2:
		vei->vei.vei_data &= ~0xFFFF;
		vei->vei.vei_data |= (uint16_t)data;
		break;
	case 4:
		vei->vei.vei_data = data;
		break;
	}
}

/*
 * get_input_data
 *
 * Utility function for manipulating register data in vm exit info
 * structs. This function ensures that the data is copied from the
 * vei->vei.vei_data field with the proper size for the operation being
 * performed.
 *
 * Parameters:
 *  vei: exit information
 *  data: location to store the result
 */
void
get_input_data(struct vm_exit *vei, uint32_t *data)
{
	switch (vei->vei.vei_size) {
	case 1:
		*data &= 0xFFFFFF00;
		*data |= (uint8_t)vei->vei.vei_data;
		break;
	case 2:
		*data &= 0xFFFF0000;
		*data |= (uint16_t)vei->vei.vei_data;
		break;
	case 4:
		*data = vei->vei.vei_data;
		break;
	default:
		log_warnx("%s: invalid i/o size %d", __func__,
		    vei->vei.vei_size);
	}

}

/*
 * translate_gva
 *
 * Translates a guest virtual address to a guest physical address by walking
 * the currently active page table (if needed).
 *
 * Note - this function can possibly alter the supplied VCPU state.
 * Specifically, it may inject exceptions depending on the current VCPU
 * configuration, and may alter %cr2 on #PF. Consequently, this function
 * should only be used as part of instruction emulation.
 *
 * Parameters:
 *  exit: The VCPU this translation should be performed for (guest MMU settings
 *      are gathered from this VCPU)
 *  va: virtual address to translate
 *  pa: pointer to paddr_t variable that will receive the translated physical
 *      address. 'pa' is unchanged on error.

/*
 * translate_gva
 *
 * Translates a guest virtual address to a guest physical address by walking
 * the currently active page table (if needed).
 *
 * Note - this function can possibly alter the supplied VCPU state.
 * Specifically, it may inject exceptions depending on the current VCPU
 * configuration, and may alter %cr2 on #PF. Consequently, this function
 * should only be used as part of instruction emulation.
 *
 * Parameters:
 *  exit: The VCPU this translation should be performed for (guest MMU settings
 *   are gathered from this VCPU)
 *  va: virtual address to translate
 *  pa: pointer to paddr_t variable that will receive the translated physical
 *   address. 'pa' is unchanged on error.
 *  mode: one of PROT_READ, PROT_WRITE, PROT_EXEC indicating the mode in which
 *   the address should be translated
 *
 * Return values:
 *  0: the address was successfully translated - 'pa' contains the physical
 *     address currently mapped by 'va'.
 *  EFAULT: the PTE for 'va' is unmapped. A #PF will be injected in this case
 *     and %cr2 set in the vcpu structure.
 *  EPERM: the access violates the page protections (eg. a write to a
 *     read-only page, or a user-mode access to a supervisor page).
 *  EIO: the accessed/dirty bits could not be written back to the PTE.
 *  EINVAL: an error occurred reading paging table structures
 */
int
translate_gva(struct vm_exit* exit, uint64_t va, uint64_t* pa, int mode)
{
	int level, shift, pdidx;
	uint64_t pte, pt_paddr, pte_paddr, mask, low_mask, high_mask;
	uint64_t shift_width, pte_size;
	struct vcpu_reg_state *vrs;

	vrs = &exit->vrs;

	if (!pa)
		return (EINVAL);

	if (!(vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PG)) {
		log_debug("%s: unpaged, va=pa=0x%llx", __func__, va);
		*pa = va;
		return (0);
	}

	pt_paddr = vrs->vrs_crs[VCPU_REGS_CR3];

	log_debug("%s: guest %%cr0=0x%llx, %%cr3=0x%llx", __func__,
	    vrs->vrs_crs[VCPU_REGS_CR0], vrs->vrs_crs[VCPU_REGS_CR3]);

	if (vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PE) {
		if (vrs->vrs_crs[VCPU_REGS_CR4] & CR4_PAE) {
			pte_size = sizeof(uint64_t);
			shift_width = 9;

			if (vrs->vrs_msrs[VCPU_REGS_EFER] & EFER_LMA) {
				/* 4 level paging */
				level = 4;
				mask = L4_MASK;
				shift = L4_SHIFT;
			} else {
				/* 32 bit with PAE paging */
				level = 3;
				mask = L3_MASK;
				shift = L3_SHIFT;
			}
		} else {
			/* 32 bit paging */
			level = 2;
			shift_width = 10;
			mask = 0xFFC00000;
			shift = 22;
			pte_size = sizeof(uint32_t);
		}
	} else
		return (EINVAL);

	/* XXX: Check for R bit in segment selector and set A bit */

	for (; level > 0; level--) {
		pdidx = (va & mask) >> shift;
		pte_paddr = (pt_paddr) + (pdidx * pte_size);

		log_debug("%s: read pte level %d @ GPA 0x%llx", __func__,
		    level, pte_paddr);
		if (read_mem(pte_paddr, &pte, pte_size)) {
			log_warn("%s: failed to read pte", __func__);
			return (EFAULT);
		}

		log_debug("%s: PTE @ 0x%llx = 0x%llx", __func__, pte_paddr,
		    pte);

		/* XXX: Set CR2 */
		if (!(pte & PG_V))
			return (EFAULT);

		/* XXX: Check for SMAP */
		if ((mode == PROT_WRITE) && !(pte & PG_RW))
			return (EPERM);

		if ((exit->cpl > 0) && !(pte & PG_u))
			return (EPERM);

		pte = pte | PG_U;
		if (mode == PROT_WRITE)
			pte = pte | PG_M;
		if (write_mem(pte_paddr, &pte, pte_size)) {
			log_warn("%s: failed to write back flags to pte",
			    __func__);
			return (EIO);
		}

		/* XXX: EINVAL if in 32bit and PG_PS is 1 but CR4.PSE is 0 */
		if (pte & PG_PS)
			break;

		if (level > 1) {
			pt_paddr = pte & PG_FRAME;
			shift -= shift_width;
			mask = mask >> shift_width;
		}
	}

	low_mask = (1 << shift) - 1;
	high_mask = (((uint64_t)1ULL << ((pte_size * 8) - 1)) - 1) ^ low_mask;
	*pa = (pte & high_mask) | (va & low_mask);

	log_debug("%s: final GPA for GVA 0x%llx = 0x%llx\n", __func__, va, *pa);

	return (0);
}
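
/*
 * Illustrative sketch, not part of the original vm.c: translate_gva() is
 * intended for instruction emulation, where only a guest virtual address is
 * known.  The GVA is first walked down to a GPA, then the bytes are fetched
 * with read_mem().  This hypothetical helper ignores accesses that cross a
 * page boundary.
 */
static int
example_fetch_guest_bytes(struct vm_exit *exit, uint64_t gva, void *buf,
    size_t len)
{
	uint64_t gpa;
	int error;

	/* Per the note above, a #PF may be injected and %cr2 updated. */
	error = translate_gva(exit, gva, &gpa, PROT_READ);
	if (error)
		return (error);

	return (read_mem(gpa, buf, len));
}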

/*
 * vm_pipe_init
 *
 * Initialize a vm_dev_pipe, setting up its file descriptors and its
 * event structure with the given callback.
 *
 * Parameters:
 *  p: pointer to vm_dev_pipe struct to initialize
 *  cb: callback to use for READ events on the read end of the pipe
 */
void
vm_pipe_init(struct vm_dev_pipe *p, void (*cb)(int, short, void *))
{
	int ret;
	int fds[2];

	memset(p, 0, sizeof(struct vm_dev_pipe));

	ret = pipe(fds);
	if (ret)
		fatal("failed to create vm_dev_pipe pipe");

	p->read = fds[0];
	p->write = fds[1];

	event_set(&p->read_ev, p->read, EV_READ | EV_PERSIST, cb, NULL);
}

/*
 * vm_pipe_send
 *
 * Send a message to an emulated device via the provided vm_dev_pipe.
 *
 * Parameters:
 *  p: pointer to initialized vm_dev_pipe
 *  msg: message to send in the channel
 */
void
vm_pipe_send(struct vm_dev_pipe *p, enum pipe_msg_type msg)
{
	size_t n;
	n = write(p->write, &msg, sizeof(msg));
	if (n != sizeof(msg))
		fatal("failed to write to device pipe");
}

/*
 * vm_pipe_recv
 *
 * Receive a message for an emulated device via the provided vm_dev_pipe.
 * Returns the message value, otherwise will exit on failure.
 *
 * Parameters:
 *  p: pointer to initialized vm_dev_pipe
 *
 * Return values:
 *  a value of enum pipe_msg_type or fatal exit on read(2) error
 */
enum pipe_msg_type
vm_pipe_recv(struct vm_dev_pipe *p)
{
	size_t n;
	enum pipe_msg_type msg;
	n = read(p->read, &msg, sizeof(msg));
	if (n != sizeof(msg))
		fatal("failed to read from device pipe");

	return msg;
}
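
/*
 * Illustrative sketch, not part of the original vm.c: a device typically
 * initializes one vm_dev_pipe, registers its read event with the event loop,
 * and later calls vm_pipe_send() from another thread with one of the
 * enum pipe_msg_type values to wake the callback, which drains the message
 * with vm_pipe_recv().  The pipe, callback and setup function below are
 * hypothetical.
 */
static struct vm_dev_pipe example_pipe;

static void
example_pipe_cb(int fd, short event, void *arg)
{
	enum pipe_msg_type msg;

	/* Runs in the event loop whenever the other side sends a message. */
	msg = vm_pipe_recv(&example_pipe);
	(void)msg;	/* a real device would act on the message value here */
}

static void
example_pipe_setup(void)
{
	vm_pipe_init(&example_pipe, example_pipe_cb);

	/* The caller still has to add the read event to the event loop. */
	event_add(&example_pipe.read_ev, NULL);
}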