/*	$OpenBSD: vm.c,v 1.54 2019/12/11 06:45:16 pd Exp $	*/

/*
 * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#include <sys/wait.h>
#include <sys/uio.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/mman.h>

#include <dev/ic/i8253reg.h>
#include <dev/isa/isareg.h>
#include <dev/pci/pcireg.h>

#include <machine/param.h>
#include <machine/psl.h>
#include <machine/pte.h>
#include <machine/specialreg.h>
#include <machine/vmmvar.h>

#include <net/if.h>

#include <errno.h>
#include <event.h>
#include <fcntl.h>
#include <imsg.h>
#include <limits.h>
#include <poll.h>
#include <pthread.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <util.h>

#include "vmd.h"
#include "vmm.h"
#include "loadfile.h"
#include "pci.h"
#include "virtio.h"
#include "proc.h"
#include "i8253.h"
#include "i8259.h"
#include "ns8250.h"
#include "mc146818.h"
#include "fw_cfg.h"
#include "atomicio.h"

io_fn_t ioports_map[MAX_PORTS];

int run_vm(int, int[][VM_MAX_BASE_PER_DISK], int *,
    struct vmop_create_params *, struct vcpu_reg_state *);
void vm_dispatch_vmm(int, short, void *);
void *event_thread(void *);
void *vcpu_run_loop(void *);
int vcpu_exit(struct vm_run_params *);
int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *);
void create_memory_map(struct vm_create_params *);
int alloc_guest_mem(struct vm_create_params *);
int vmm_create_vm(struct vm_create_params *);
void init_emulated_hw(struct vmop_create_params *, int,
    int[][VM_MAX_BASE_PER_DISK], int *);
void restore_emulated_hw(struct vm_create_params *, int, int *,
    int[][VM_MAX_BASE_PER_DISK], int);
void vcpu_exit_inout(struct vm_run_params *);
uint8_t vcpu_exit_pci(struct vm_run_params *);
int vcpu_pic_intr(uint32_t, uint32_t, uint8_t);
int loadfile_bios(FILE *, struct vcpu_reg_state *);
int send_vm(int, struct vm_create_params *);
int dump_send_header(int);
int dump_vmr(int, struct vm_mem_range *);
int dump_mem(int, struct vm_create_params *);
void restore_vmr(int, struct vm_mem_range *);
void restore_mem(int, struct vm_create_params *);
int restore_vm_params(int, struct vm_create_params *);
void pause_vm(struct vm_create_params *);
void unpause_vm(struct vm_create_params *);

int translate_gva(struct vm_exit *, uint64_t, uint64_t *, int);

static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t,
    size_t);

int con_fd;
struct vmd_vm *current_vm;

extern struct vmd *env;

extern char *__progname;

pthread_mutex_t threadmutex;
pthread_cond_t threadcond;

pthread_cond_t vcpu_run_cond[VMM_MAX_VCPUS_PER_VM];
pthread_mutex_t vcpu_run_mtx[VMM_MAX_VCPUS_PER_VM];
pthread_cond_t vcpu_pause_cond[VMM_MAX_VCPUS_PER_VM];
pthread_mutex_t vcpu_pause_mtx[VMM_MAX_VCPUS_PER_VM];
pthread_cond_t vcpu_unpause_cond[VMM_MAX_VCPUS_PER_VM];
pthread_mutex_t vcpu_unpause_mtx[VMM_MAX_VCPUS_PER_VM];
uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM];
uint8_t vcpu_done[VMM_MAX_VCPUS_PER_VM];

/*
 * Represents a standard register set for an OS to be booted
 * as a flat 64 bit address space.
 *
 * NOT set here are:
 *  RIP
 *  RSP
 *  GDTR BASE
 *
 * Specific bootloaders should clone this structure and override
 * those fields as needed.
 *
 * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on
 *        features of the CPU in use.
 */
static const struct vcpu_reg_state vcpu_init_flat64 = {
	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
	.vrs_gprs[VCPU_REGS_RIP] = 0x0,
	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
	.vrs_crs[VCPU_REGS_CR0] = CR0_ET | CR0_PE | CR0_PG,
	.vrs_crs[VCPU_REGS_CR3] = PML4_PAGE,
	.vrs_crs[VCPU_REGS_CR4] = CR4_PAE | CR4_PSE,
	.vrs_crs[VCPU_REGS_PDPTE0] = 0ULL,
	.vrs_crs[VCPU_REGS_PDPTE1] = 0ULL,
	.vrs_crs[VCPU_REGS_PDPTE2] = 0ULL,
	.vrs_crs[VCPU_REGS_PDPTE3] = 0ULL,
	.vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0},
	.vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
	.vrs_msrs[VCPU_REGS_EFER] = EFER_LME | EFER_LMA,
	.vrs_drs[VCPU_REGS_DR0] = 0x0,
	.vrs_drs[VCPU_REGS_DR1] = 0x0,
	.vrs_drs[VCPU_REGS_DR2] = 0x0,
	.vrs_drs[VCPU_REGS_DR3] = 0x0,
	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
	.vrs_drs[VCPU_REGS_DR7] = 0x400,
	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
	.vrs_msrs[VCPU_REGS_MISC_ENABLE] = 0ULL,
	.vrs_crs[VCPU_REGS_XCR0] = XCR0_X87
};

/*
 * Represents a standard register set for a BIOS to be booted
 * as a flat 16 bit address space.
 */
static const struct vcpu_reg_state vcpu_init_flat16 = {
	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
	.vrs_gprs[VCPU_REGS_RIP] = 0xFFF0,
	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
	.vrs_crs[VCPU_REGS_CR0] = 0x60000010,
	.vrs_crs[VCPU_REGS_CR3] = 0,
	.vrs_sregs[VCPU_REGS_CS] = { 0xF000, 0xFFFF, 0x809F, 0xF0000},
	.vrs_sregs[VCPU_REGS_DS] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_sregs[VCPU_REGS_ES] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_sregs[VCPU_REGS_FS] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_sregs[VCPU_REGS_GS] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_sregs[VCPU_REGS_SS] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
	.vrs_msrs[VCPU_REGS_EFER] = 0ULL,
	.vrs_drs[VCPU_REGS_DR0] = 0x0,
	.vrs_drs[VCPU_REGS_DR1] = 0x0,
	.vrs_drs[VCPU_REGS_DR2] = 0x0,
	.vrs_drs[VCPU_REGS_DR3] = 0x0,
	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
	.vrs_drs[VCPU_REGS_DR7] = 0x400,
	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
	.vrs_crs[VCPU_REGS_XCR0] = XCR0_X87
};

/*
 * loadfile_bios
 *
 * As an alternative to loadfile_elf, this function loads a non-ELF BIOS image
 * directly into memory.
 *
 * Parameters:
 *  fp: file handle of the BIOS image to load
 *  (out) vrs: register state to set on init for this kernel
 *
 * Return values:
 *  0 if successful
 *  various error codes returned from read(2) or loadelf functions
 */
int
loadfile_bios(FILE *fp, struct vcpu_reg_state *vrs)
{
	off_t size, off;

	/* Set up a "flat 16 bit" register state for BIOS */
	memcpy(vrs, &vcpu_init_flat16, sizeof(*vrs));

	/* Get the size of the BIOS image and seek to the beginning */
	if (fseeko(fp, 0, SEEK_END) == -1 || (size = ftello(fp)) == -1 ||
	    fseeko(fp, 0, SEEK_SET) == -1)
		return (-1);

	/* The BIOS image must end at 1M */
	if ((off = 1048576 - size) < 0)
		return (-1);

	/* Read BIOS image into memory */
	if (mread(fp, off, size) != (size_t)size) {
		errno = EIO;
		return (-1);
	}

	log_debug("%s: loaded BIOS image", __func__);

	return (0);
}

/*
 * start_vm
 *
 * After forking a new VM process, starts the new VM with the creation
 * parameters supplied (in the incoming vm->vm_params field). This
 * function performs a basic sanity check on the incoming parameters
 * and then performs the following steps to complete the creation of the VM:
 *
 * 1. validates and creates the new VM
 * 2. opens the imsg control channel to the parent and drops more privilege
 * 3. drops additional privileges by calling pledge(2)
 * 4. loads the kernel from the disk image or file descriptor
 * 5. runs the VM's VCPU loops.
 *
 * Parameters:
 *  vm: The VM data structure that includes the VM create parameters.
 *  fd: The imsg socket that is connected to the parent process.
 *
 * Return values:
 *  0: success
 *  !0 : failure - typically an errno indicating the source of the failure
 */
int
start_vm(struct vmd_vm *vm, int fd)
{
	struct vmop_create_params *vmc = &vm->vm_params;
	struct vm_create_params *vcp = &vmc->vmc_params;
	struct vcpu_reg_state vrs;
	int nicfds[VMM_MAX_NICS_PER_VM];
	int ret;
	FILE *fp;
	struct vmboot_params vmboot;
	size_t i;
	struct vm_rwregs_params vrp;

	/* Child */
	setproctitle("%s", vcp->vcp_name);
	log_procinit(vcp->vcp_name);

	if (!(vm->vm_state & VM_STATE_RECEIVED))
		create_memory_map(vcp);

	ret = alloc_guest_mem(vcp);

	if (ret) {
		errno = ret;
		fatal("could not allocate guest memory - exiting");
	}

	ret = vmm_create_vm(vcp);
	current_vm = vm;

	/* send back the kernel-generated vm id (0 on error) */
	if (write(fd, &vcp->vcp_id, sizeof(vcp->vcp_id)) !=
	    sizeof(vcp->vcp_id))
		fatal("write vcp id");

	if (ret) {
		errno = ret;
		fatal("create vmm ioctl failed - exiting");
	}

	/*
	 * pledge in the vm processes:
	 * stdio - for malloc and basic I/O including events.
	 * recvfd - for send/recv.
	 * vmm - for the vmm ioctls and operations.
	 */
	if (pledge("stdio vmm recvfd", NULL) == -1)
		fatal("pledge");

	if (vm->vm_state & VM_STATE_RECEIVED) {
		ret = read(vm->vm_receive_fd, &vrp, sizeof(vrp));
		if (ret != sizeof(vrp)) {
			fatal("received incomplete vrp - exiting");
		}
		vrs = vrp.vrwp_regs;
	} else {
		/*
		 * Set up default "flat 64 bit" register state - RIP,
		 * RSP, and GDT info will be set in bootloader
		 */
		memcpy(&vrs, &vcpu_init_flat64, sizeof(vrs));

		/* Find and open kernel image */
		if ((fp = vmboot_open(vm->vm_kernel,
		    vm->vm_disks[0], vmc->vmc_diskbases[0],
		    vmc->vmc_disktypes[0], &vmboot)) == NULL)
			fatalx("failed to open kernel - exiting");

		/* Load kernel image */
		ret = loadfile_elf(fp, vcp, &vrs,
		    vmboot.vbp_bootdev, vmboot.vbp_howto, vmc->vmc_bootdevice);

		/*
		 * Try BIOS as a fallback (only if it was provided as an image
		 * with vm->vm_kernel and not loaded from the disk)
		 */
		if (ret && errno == ENOEXEC && vm->vm_kernel != -1)
			ret = loadfile_bios(fp, &vrs);

		if (ret)
			fatal("failed to load kernel or BIOS - exiting");

		vmboot_close(fp, &vmboot);
	}

	if (vm->vm_kernel != -1)
		close(vm->vm_kernel);

	con_fd = vm->vm_tty;
	if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1)
		fatal("failed to set nonblocking mode on console");

	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
		nicfds[i] = vm->vm_ifs[i].vif_fd;

	event_init();

	if (vm->vm_state & VM_STATE_RECEIVED) {
		restore_emulated_hw(vcp, vm->vm_receive_fd, nicfds,
		    vm->vm_disks, vm->vm_cdrom);
		restore_mem(vm->vm_receive_fd, vcp);
		if (restore_vm_params(vm->vm_receive_fd, vcp))
			fatal("restore vm params failed");
		unpause_vm(vcp);
	}

	if (vmm_pipe(vm, fd, vm_dispatch_vmm) == -1)
		fatal("setup vm pipe");

	/* Execute the vcpu run loop(s) for this VM */
	ret = run_vm(vm->vm_cdrom, vm->vm_disks, nicfds, &vm->vm_params, &vrs);

	/* Ensure that any in-flight data is written back */
	virtio_shutdown(vm);

	return (ret);
}

/*
 * vm_dispatch_vmm
 *
 * imsg callback for messages that are received from the vmm parent process.
 */
void
vm_dispatch_vmm(int fd, short event, void *arg)
{
	struct vmd_vm *vm = arg;
	struct vmop_result vmr;
	struct imsgev *iev = &vm->vm_iev;
	struct imsgbuf *ibuf = &iev->ibuf;
	struct imsg imsg;
	ssize_t n;
	int verbose;

	if (event & EV_READ) {
		if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
			fatal("%s: imsg_read", __func__);
		if (n == 0)
			_exit(0);
	}

	if (event & EV_WRITE) {
		if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
			fatal("%s: msgbuf_write fd %d", __func__, ibuf->fd);
		if (n == 0)
			_exit(0);
	}

	for (;;) {
		if ((n = imsg_get(ibuf, &imsg)) == -1)
			fatal("%s: imsg_get", __func__);
		if (n == 0)
			break;

#if DEBUG > 1
		log_debug("%s: got imsg %d from %s",
		    __func__, imsg.hdr.type,
		    vm->vm_params.vmc_params.vcp_name);
#endif

		switch (imsg.hdr.type) {
		case IMSG_CTL_VERBOSE:
			IMSG_SIZE_CHECK(&imsg, &verbose);
			memcpy(&verbose, imsg.data, sizeof(verbose));
			log_setverbose(verbose);
			break;
		case IMSG_VMDOP_VM_SHUTDOWN:
			if (vmmci_ctl(VMMCI_SHUTDOWN) == -1)
				_exit(0);
			break;
		case IMSG_VMDOP_VM_REBOOT:
			if (vmmci_ctl(VMMCI_REBOOT) == -1)
				_exit(0);
			break;
		case IMSG_VMDOP_PAUSE_VM:
			vmr.vmr_result = 0;
			vmr.vmr_id = vm->vm_vmid;
			pause_vm(&vm->vm_params.vmc_params);
			imsg_compose_event(&vm->vm_iev,
			    IMSG_VMDOP_PAUSE_VM_RESPONSE,
			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
			    sizeof(vmr));
			break;
		case IMSG_VMDOP_UNPAUSE_VM:
			vmr.vmr_result = 0;
			vmr.vmr_id = vm->vm_vmid;
			unpause_vm(&vm->vm_params.vmc_params);
			imsg_compose_event(&vm->vm_iev,
			    IMSG_VMDOP_UNPAUSE_VM_RESPONSE,
			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
			    sizeof(vmr));
			break;
		case IMSG_VMDOP_SEND_VM_REQUEST:
			vmr.vmr_id = vm->vm_vmid;
			vmr.vmr_result = send_vm(imsg.fd,
			    &vm->vm_params.vmc_params);
			imsg_compose_event(&vm->vm_iev,
			    IMSG_VMDOP_SEND_VM_RESPONSE,
			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
			    sizeof(vmr));
			break;
		default:
			fatalx("%s: got invalid imsg %d from %s",
			    __func__, imsg.hdr.type,
			    vm->vm_params.vmc_params.vcp_name);
		}
		imsg_free(&imsg);
	}
	imsg_event_add(iev);
}

/*
 * vm_shutdown
 *
 * Tell the vmm parent process to shut down or reboot the VM and exit.
 */
__dead void
vm_shutdown(unsigned int cmd)
{
	switch (cmd) {
	case VMMCI_NONE:
	case VMMCI_SHUTDOWN:
		(void)imsg_compose_event(&current_vm->vm_iev,
		    IMSG_VMDOP_VM_SHUTDOWN, 0, 0, -1, NULL, 0);
		break;
	case VMMCI_REBOOT:
		(void)imsg_compose_event(&current_vm->vm_iev,
		    IMSG_VMDOP_VM_REBOOT, 0, 0, -1, NULL, 0);
		break;
	default:
		fatalx("invalid vm ctl command: %d", cmd);
	}
	imsg_flush(&current_vm->vm_iev.ibuf);

	_exit(0);
}

int
send_vm(int fd, struct vm_create_params *vcp)
{
	struct vm_rwregs_params vrp;
	struct vm_rwvmparams_params vpp;
	struct vmop_create_params *vmc;
	struct vm_terminate_params vtp;
	unsigned int flags = 0;
	unsigned int i;
	int ret = 0;
	size_t sz;

	if (dump_send_header(fd)) {
		log_info("%s: failed to send vm dump header", __func__);
		goto err;
	}

	pause_vm(vcp);

	vmc = calloc(1, sizeof(struct vmop_create_params));
	if (vmc == NULL) {
		log_warn("%s: calloc error getting vmc", __func__);
		ret = -1;
		goto err;
	}

	flags |= VMOP_CREATE_MEMORY;
	memcpy(&vmc->vmc_params, &current_vm->vm_params, sizeof(struct
	    vmop_create_params));
	vmc->vmc_flags = flags;
	vrp.vrwp_vm_id = vcp->vcp_id;
	vrp.vrwp_mask = VM_RWREGS_ALL;
	vpp.vpp_mask = VM_RWVMPARAMS_ALL;
	vpp.vpp_vm_id = vcp->vcp_id;

	sz = atomicio(vwrite, fd, vmc, sizeof(struct vmop_create_params));
	if (sz != sizeof(struct vmop_create_params)) {
		ret = -1;
		goto err;
	}

	for (i = 0; i < vcp->vcp_ncpus; i++) {
		vrp.vrwp_vcpu_id = i;
		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READREGS, &vrp))) {
			log_warn("%s: readregs failed", __func__);
			goto err;
		}

		sz = atomicio(vwrite, fd, &vrp,
		    sizeof(struct vm_rwregs_params));
		if (sz != sizeof(struct vm_rwregs_params)) {
			log_warn("%s: dumping registers failed", __func__);
			ret = -1;
			goto err;
		}
	}

	if ((ret = i8253_dump(fd)))
		goto err;
	if ((ret = i8259_dump(fd)))
		goto err;
	if ((ret = ns8250_dump(fd)))
		goto err;
	if ((ret = mc146818_dump(fd)))
		goto err;
	if ((ret = fw_cfg_dump(fd)))
		goto err;
	if ((ret = pci_dump(fd)))
		goto err;
	if ((ret = virtio_dump(fd)))
		goto err;
	if ((ret = dump_mem(fd, vcp)))
		goto err;

	for (i = 0; i < vcp->vcp_ncpus; i++) {
		vpp.vpp_vcpu_id = i;
		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READVMPARAMS, &vpp))) {
			log_warn("%s: readvmparams failed", __func__);
			goto err;
		}

		sz = atomicio(vwrite, fd, &vpp,
		    sizeof(struct vm_rwvmparams_params));
		if (sz != sizeof(struct vm_rwvmparams_params)) {
			log_warn("%s: dumping vm params failed", __func__);
			ret = -1;
			goto err;
		}
	}

	vtp.vtp_vm_id = vcp->vcp_id;
	if (ioctl(env->vmd_fd, VMM_IOC_TERM, &vtp) == -1) {
		log_warnx("%s: term IOC error: %d, %d", __func__,
		    errno, ENOENT);
	}
err:
	close(fd);
	if (ret)
		unpause_vm(vcp);
	return ret;
}

int
dump_send_header(int fd) {
	struct vm_dump_header vmh;
	int i;

	memcpy(&vmh.vmh_signature, VM_DUMP_SIGNATURE,
	    sizeof(vmh.vmh_signature));

	vmh.vmh_cpuids[0].code = 0x00;
	vmh.vmh_cpuids[0].leaf = 0x00;

	vmh.vmh_cpuids[1].code = 0x01;
	vmh.vmh_cpuids[1].leaf = 0x00;

	vmh.vmh_cpuids[2].code = 0x07;
	vmh.vmh_cpuids[2].leaf = 0x00;

	vmh.vmh_cpuids[3].code = 0x0d;
	vmh.vmh_cpuids[3].leaf = 0x00;

	vmh.vmh_cpuids[4].code = 0x80000001;
	vmh.vmh_cpuids[4].leaf = 0x00;

	vmh.vmh_version = VM_DUMP_VERSION;

	for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
		CPUID_LEAF(vmh.vmh_cpuids[i].code,
		    vmh.vmh_cpuids[i].leaf,
		    vmh.vmh_cpuids[i].a,
		    vmh.vmh_cpuids[i].b,
		    vmh.vmh_cpuids[i].c,
		    vmh.vmh_cpuids[i].d);
	}

	if (atomicio(vwrite, fd, &vmh, sizeof(vmh)) != sizeof(vmh))
		return (-1);

	return (0);
}

int
dump_mem(int fd, struct vm_create_params *vcp)
{
	unsigned int i;
	int ret;
	struct vm_mem_range *vmr;

	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		ret = dump_vmr(fd, vmr);
		if (ret)
			return ret;
	}
	return (0);
}

int
restore_vm_params(int fd, struct vm_create_params *vcp) {
	unsigned int i;
	struct vm_rwvmparams_params vpp;

	for (i = 0; i < vcp->vcp_ncpus; i++) {
		if (atomicio(read, fd, &vpp, sizeof(vpp)) != sizeof(vpp)) {
			log_warn("%s: error restoring vm params", __func__);
			return (-1);
		}
		vpp.vpp_vm_id = vcp->vcp_id;
		vpp.vpp_vcpu_id = i;
		if (ioctl(env->vmd_fd, VMM_IOC_WRITEVMPARAMS, &vpp) < 0) {
			log_debug("%s: writing vm params failed", __func__);
			return (-1);
		}
	}
	return (0);
}

void
restore_mem(int fd, struct vm_create_params *vcp)
{
	unsigned int i;
	struct vm_mem_range *vmr;

	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		restore_vmr(fd, vmr);
	}
}

int
dump_vmr(int fd, struct vm_mem_range *vmr)
{
	size_t rem = vmr->vmr_size, read = 0;
	char buf[PAGE_SIZE];

	while (rem > 0) {
		if (read_mem(vmr->vmr_gpa + read, buf, PAGE_SIZE)) {
			log_warn("failed to read vmr");
			return (-1);
		}
		if (atomicio(vwrite, fd, buf, sizeof(buf)) != sizeof(buf)) {
			log_warn("failed to dump vmr");
			return (-1);
		}
		rem = rem - PAGE_SIZE;
		read = read + PAGE_SIZE;
	}
	return (0);
}

void
restore_vmr(int fd, struct vm_mem_range *vmr)
{
	size_t rem = vmr->vmr_size, wrote = 0;
	char buf[PAGE_SIZE];

	while (rem > 0) {
		if (atomicio(read, fd, buf, sizeof(buf)) != sizeof(buf))
			fatal("failed to restore vmr");
		if (write_mem(vmr->vmr_gpa + wrote, buf, PAGE_SIZE))
			fatal("failed to write vmr");
		rem = rem - PAGE_SIZE;
		wrote = wrote + PAGE_SIZE;
	}
}

void
pause_vm(struct vm_create_params *vcp)
{
	unsigned int n;
	int ret;
	if (current_vm->vm_state & VM_STATE_PAUSED)
		return;

	current_vm->vm_state |= VM_STATE_PAUSED;

	for (n = 0; n < vcp->vcp_ncpus; n++) {
		ret = pthread_mutex_lock(&vcpu_pause_mtx[n]);
		if (ret) {
			log_warnx("%s: can't lock vcpu pause mtx (%d)",
			    __func__, (int)ret);
			return;
		}

		ret = pthread_cond_broadcast(&vcpu_run_cond[n]);
		if (ret) {
			log_warnx("%s: can't broadcast vcpu run cond (%d)",
			    __func__, (int)ret);
			return;
		}

		ret = pthread_cond_wait(&vcpu_pause_cond[n], &vcpu_pause_mtx[n]);
		if (ret) {
			log_warnx("%s: can't wait on vcpu pause cond (%d)",
			    __func__, (int)ret);
			return;
		}
		ret = pthread_mutex_unlock(&vcpu_pause_mtx[n]);
		if (ret) {
			log_warnx("%s: can't unlock vcpu mtx (%d)",
			    __func__, (int)ret);
			return;
		}
	}

	i8253_stop();
	mc146818_stop();
	ns8250_stop();
	virtio_stop(vcp);
}

void
unpause_vm(struct vm_create_params *vcp)
{
	unsigned int n;
	int ret;
	if (!(current_vm->vm_state & VM_STATE_PAUSED))
		return;

	current_vm->vm_state &= ~VM_STATE_PAUSED;
	for (n = 0; n < vcp->vcp_ncpus; n++) {
		ret = pthread_cond_broadcast(&vcpu_unpause_cond[n]);
		if (ret) {
			log_warnx("%s: can't broadcast vcpu unpause cond (%d)",
			    __func__, (int)ret);
			return;
		}
	}

	i8253_start();
	mc146818_start();
	ns8250_start();
	virtio_start(vcp);
}

/*
 * vcpu_reset
 *
 * Requests vmm(4) to reset the VCPUs in the indicated VM to
 * the register state provided
 *
 * Parameters:
 *  vmid: VM ID to reset
 *  vcpu_id: VCPU ID to reset
 *  vrs: the register state to initialize
 *
 * Return values:
 *  0: success
 *  !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM ID is not
 *      valid)
 */
int
vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs)
{
	struct vm_resetcpu_params vrp;

	memset(&vrp, 0, sizeof(vrp));
	vrp.vrp_vm_id = vmid;
	vrp.vrp_vcpu_id = vcpu_id;
	memcpy(&vrp.vrp_init_state, vrs, sizeof(struct vcpu_reg_state));

	log_debug("%s: resetting vcpu %d for vm %d", __func__, vcpu_id, vmid);

	if (ioctl(env->vmd_fd, VMM_IOC_RESETCPU, &vrp) == -1)
		return (errno);

	return (0);
}

/*
 * create_memory_map
 *
 * Sets up the guest physical memory ranges that the VM can access.
 *
 * Parameters:
 *  vcp: VM create parameters describing the VM whose memory map
 *       is being created
 *
 * Return values:
 *  nothing
 */
void
create_memory_map(struct vm_create_params *vcp)
{
	size_t len, mem_bytes, mem_mb;

	mem_mb = vcp->vcp_memranges[0].vmr_size;
	vcp->vcp_nmemranges = 0;
	if (mem_mb < 1 || mem_mb > VMM_MAX_VM_MEM_SIZE)
		return;

	mem_bytes = mem_mb * 1024 * 1024;

	/* First memory region: 0 - LOWMEM_KB (DOS low mem) */
	len = LOWMEM_KB * 1024;
	vcp->vcp_memranges[0].vmr_gpa = 0x0;
	vcp->vcp_memranges[0].vmr_size = len;
	mem_bytes -= len;

	/*
	 * Second memory region: LOWMEM_KB - 1MB.
	 *
	 * N.B. - Normally ROMs or parts of video RAM are mapped here.
	 * We have to add this region, because some systems
	 * unconditionally write to 0xb8000 (VGA RAM), and
	 * we need to make sure that vmm(4) permits accesses
	 * to it. So allocate guest memory for it.
	 */
	len = 0x100000 - LOWMEM_KB * 1024;
	vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024;
	vcp->vcp_memranges[1].vmr_size = len;
	mem_bytes -= len;

	/* Make sure that we do not place physical memory into MMIO ranges. */
	if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - 0x100000)
		len = VMM_PCI_MMIO_BAR_BASE - 0x100000;
	else
		len = mem_bytes;

	/* Third memory region: 1MB - (1MB + len) */
	vcp->vcp_memranges[2].vmr_gpa = 0x100000;
	vcp->vcp_memranges[2].vmr_size = len;
	mem_bytes -= len;

	if (mem_bytes > 0) {
		/* Fourth memory region for the remaining memory (if any) */
		vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1;
		vcp->vcp_memranges[3].vmr_size = mem_bytes;
		vcp->vcp_nmemranges = 4;
	} else
		vcp->vcp_nmemranges = 3;
}

/*
 * alloc_guest_mem
 *
 * Allocates memory for the guest.
 * Instead of doing a single allocation with one mmap(), we allocate memory
 * separately for every range for the following reasons:
 * - ASLR for the individual ranges
 * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to
 *   map the single mmap'd userspace memory to the individual guest physical
 *   memory ranges, the underlying amap of the single mmap'd range would have
 *   to allocate per-page reference counters. The reason is that the
 *   individual guest physical ranges would reference the single mmap'd region
 *   only partially. However, if every guest physical range has its own
 *   corresponding mmap'd userspace allocation, there are no partial
 *   references: every guest physical range fully references an mmap'd
 *   range => no per-page reference counters have to be allocated.
 *
 * Return values:
 *  0: success
 *  !0: failure - errno indicating the source of the failure
 */
int
alloc_guest_mem(struct vm_create_params *vcp)
{
	void *p;
	int ret;
	size_t i, j;
	struct vm_mem_range *vmr;

	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANON, -1, 0);
		if (p == MAP_FAILED) {
			ret = errno;
			for (j = 0; j < i; j++) {
				vmr = &vcp->vcp_memranges[j];
				munmap((void *)vmr->vmr_va, vmr->vmr_size);
			}

			return (ret);
		}

		vmr->vmr_va = (vaddr_t)p;
	}

	return (0);
}

/*
 * vmm_create_vm
 *
 * Requests vmm(4) to create a new VM using the supplied creation
 * parameters. This operation results in the creation of the in-kernel
 * structures for the VM, but does not start the VM's vcpu(s).
 *
 * Parameters:
 *  vcp: vm_create_params struct containing the VM's desired creation
 *      configuration
 *
 * Return values:
 *  0: success
 *  !0 : ioctl to vmm(4) failed
 */
int
vmm_create_vm(struct vm_create_params *vcp)
{
	/* Sanity check arguments */
	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_nmemranges == 0 ||
	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
		return (EINVAL);

	if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
		return (EINVAL);

	if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) == -1)
		return (errno);

	return (0);
}

/*
 * init_emulated_hw
 *
 * Initializes the userspace hardware emulation
 */
void
init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
    int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
{
	struct vm_create_params *vcp = &vmc->vmc_params;
	int i;
	uint64_t memlo, memhi;

	/* Calculate memory size for NVRAM registers */
	memlo = memhi = 0;
	if (vcp->vcp_nmemranges > 2)
		memlo = vcp->vcp_memranges[2].vmr_size - 15 * 0x100000;

	if (vcp->vcp_nmemranges > 3)
		memhi = vcp->vcp_memranges[3].vmr_size;

	/* Reset the IO port map */
	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);

	/* Init i8253 PIT */
	i8253_init(vcp->vcp_id);
	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
	ioports_map[PCKBC_AUX] = vcpu_exit_i8253_misc;

	/* Init mc146818 RTC */
	mc146818_init(vcp->vcp_id, memlo, memhi);
	ioports_map[IO_RTC] = vcpu_exit_mc146818;
	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;

	/* Init master and slave PICs */
	i8259_init();
	ioports_map[IO_ICU1] = vcpu_exit_i8259;
	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
	ioports_map[IO_ICU2] = vcpu_exit_i8259;
	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
	ioports_map[ELCR0] = vcpu_exit_elcr;
	ioports_map[ELCR1] = vcpu_exit_elcr;

	/* Init ns8250 UART */
	ns8250_init(con_fd, vcp->vcp_id);
	for (i = COM1_DATA; i <= COM1_SCR; i++)
		ioports_map[i] = vcpu_exit_com;

	/* Init QEMU fw_cfg interface */
	fw_cfg_init(vmc);
	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;

	/* Initialize PCI */
	for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++)
		ioports_map[i] = vcpu_exit_pci;

	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
	pci_init();

	/* Initialize virtio devices */
	virtio_init(current_vm, child_cdrom, child_disks, child_taps);
}

/*
 * restore_emulated_hw
 *
 * Restores the userspace hardware emulation from fd
 */
void
restore_emulated_hw(struct vm_create_params *vcp, int fd,
    int *child_taps, int child_disks[][VM_MAX_BASE_PER_DISK], int child_cdrom)
{
	/* struct vm_create_params *vcp = &vmc->vmc_params; */
	int i;
	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);

	/* Init i8253 PIT */
	i8253_restore(fd, vcp->vcp_id);
	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;

	/* Init master and slave PICs */
	i8259_restore(fd);
	ioports_map[IO_ICU1] = vcpu_exit_i8259;
	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
	ioports_map[IO_ICU2] = vcpu_exit_i8259;
	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;

	/* Init ns8250 UART */
	ns8250_restore(fd, con_fd, vcp->vcp_id);
	for (i = COM1_DATA; i <= COM1_SCR; i++)
		ioports_map[i] = vcpu_exit_com;

	/* Init mc146818 RTC */
	mc146818_restore(fd, vcp->vcp_id);
	ioports_map[IO_RTC] = vcpu_exit_mc146818;
	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;

	/* Init QEMU fw_cfg interface */
	fw_cfg_restore(fd);
	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;

	/* Initialize PCI */
	for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++)
		ioports_map[i] = vcpu_exit_pci;

	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
	pci_restore(fd);
	virtio_restore(fd, current_vm, child_cdrom, child_disks, child_taps);
}

/*
 * run_vm
 *
 * Runs the VM whose creation parameters are specified in vcp
 *
 * Parameters:
 *  child_cdrom: previously-opened child ISO disk file descriptor
 *  child_disks: previously-opened child VM disk file descriptors
 *  child_taps: previously-opened child tap file descriptors
 *  vmc: vmop_create_params struct containing the VM's desired creation
 *      configuration
 *  vrs: VCPU register state to initialize
 *
 * Return values:
 *  0: the VM exited normally
 *  !0 : the VM exited abnormally or failed to start
 */
int
run_vm(int child_cdrom, int child_disks[][VM_MAX_BASE_PER_DISK],
    int *child_taps, struct vmop_create_params *vmc,
    struct vcpu_reg_state *vrs)
{
	struct vm_create_params *vcp = &vmc->vmc_params;
	struct vm_rwregs_params vregsp;
	uint8_t evdone = 0;
	size_t i;
	int ret;
	pthread_t *tid, evtid;
	struct vm_run_params **vrp;
	void *exit_status;

	if (vcp == NULL)
		return (EINVAL);

	if (child_cdrom == -1 && strlen(vcp->vcp_cdrom))
		return (EINVAL);

	if (child_disks == NULL && vcp->vcp_ndisks != 0)
		return (EINVAL);

	if (child_taps == NULL && vcp->vcp_nnics != 0)
		return (EINVAL);

	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_nmemranges == 0 ||
	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
		return (EINVAL);

	tid = calloc(vcp->vcp_ncpus, sizeof(pthread_t));
	vrp = calloc(vcp->vcp_ncpus, sizeof(struct vm_run_params *));
	if (tid == NULL || vrp == NULL) {
		log_warn("%s: memory allocation error - exiting.",
		    __progname);
		return (ENOMEM);
	}

	log_debug("%s: initializing hardware for vm %s", __func__,
	    vcp->vcp_name);

	if (!(current_vm->vm_state & VM_STATE_RECEIVED))
		init_emulated_hw(vmc, child_cdrom, child_disks, child_taps);

	ret = pthread_mutex_init(&threadmutex, NULL);
	if (ret) {
		log_warn("%s: could not initialize thread state mutex",
		    __func__);
		return (ret);
	}
	ret = pthread_cond_init(&threadcond, NULL);
	if (ret) {
		log_warn("%s: could not initialize thread state "
		    "condition variable", __func__);
		return (ret);
	}

	mutex_lock(&threadmutex);

	log_debug("%s: starting vcpu threads for vm %s", __func__,
	    vcp->vcp_name);

	/*
	 * Create and launch one thread for each VCPU. These threads may
	 * migrate between PCPUs over time; the need to reload CPU state
	 * in such situations is detected and performed by vmm(4) in the
	 * kernel.
	 */
	for (i = 0; i < vcp->vcp_ncpus; i++) {
		vrp[i] = malloc(sizeof(struct vm_run_params));
		if (vrp[i] == NULL) {
			log_warn("%s: memory allocation error - "
			    "exiting.", __progname);
			/* caller will exit, so skip freeing */
			return (ENOMEM);
		}
		vrp[i]->vrp_exit = malloc(sizeof(struct vm_exit));
		if (vrp[i]->vrp_exit == NULL) {
			log_warn("%s: memory allocation error - "
			    "exiting.", __progname);
			/* caller will exit, so skip freeing */
			return (ENOMEM);
		}
		vrp[i]->vrp_vm_id = vcp->vcp_id;
		vrp[i]->vrp_vcpu_id = i;

		if (vcpu_reset(vcp->vcp_id, i, vrs)) {
			log_warnx("%s: cannot reset VCPU %zu - exiting.",
			    __progname, i);
			return (EIO);
		}

		/* once more because reset_cpu changes regs */
		if (current_vm->vm_state & VM_STATE_RECEIVED) {
			vregsp.vrwp_vm_id = vcp->vcp_id;
			vregsp.vrwp_vcpu_id = i;
			vregsp.vrwp_regs = *vrs;
			vregsp.vrwp_mask = VM_RWREGS_ALL;
			if ((ret = ioctl(env->vmd_fd, VMM_IOC_WRITEREGS,
			    &vregsp)) == -1) {
				log_warn("%s: writeregs failed", __func__);
				return (ret);
			}
		}

		ret = pthread_cond_init(&vcpu_run_cond[i], NULL);
		if (ret) {
			log_warnx("%s: cannot initialize cond var (%d)",
			    __progname, ret);
			return (ret);
		}

		ret = pthread_mutex_init(&vcpu_run_mtx[i], NULL);
		if (ret) {
			log_warnx("%s: cannot initialize mtx (%d)",
			    __progname, ret);
			return (ret);
		}
		ret = pthread_cond_init(&vcpu_pause_cond[i], NULL);
		if (ret) {
			log_warnx("%s: cannot initialize pause cond var (%d)",
			    __progname, ret);
			return (ret);
		}

		ret = pthread_mutex_init(&vcpu_pause_mtx[i], NULL);
		if (ret) {
			log_warnx("%s: cannot initialize pause mtx (%d)",
			    __progname, ret);
			return (ret);
		}
		ret = pthread_cond_init(&vcpu_unpause_cond[i], NULL);
		if (ret) {
			log_warnx("%s: cannot initialize unpause var (%d)",
			    __progname, ret);
			return (ret);
		}

		ret = pthread_mutex_init(&vcpu_unpause_mtx[i], NULL);
		if (ret) {
			log_warnx("%s: cannot initialize unpause mtx (%d)",
			    __progname, ret);
			return (ret);
		}

		vcpu_hlt[i] = 0;

		/* Start each VCPU run thread at vcpu_run_loop */
		ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]);
		if (ret) {
			/* caller will _exit after this return */
			ret = errno;
			log_warn("%s: could not create vcpu thread %zu",
			    __func__, i);
			return (ret);
		}
	}

	log_debug("%s: waiting on events for VM %s", __func__, vcp->vcp_name);
	ret = pthread_create(&evtid, NULL, event_thread, &evdone);
	if (ret) {
		errno = ret;
		log_warn("%s: could not create event thread", __func__);
		return (ret);
	}

	for (;;) {
		ret = pthread_cond_wait(&threadcond, &threadmutex);
		if (ret) {
			log_warn("%s: waiting on thread state condition "
			    "variable failed", __func__);
			return (ret);
		}

		/*
		 * Did a VCPU thread exit with an error? => return the first one
		 */
		for (i = 0; i < vcp->vcp_ncpus; i++) {
			if (vcpu_done[i] == 0)
				continue;

			if (pthread_join(tid[i], &exit_status)) {
				log_warn("%s: failed to join thread %zd - "
				    "exiting", __progname, i);
				return (EIO);
			}

			ret = (intptr_t)exit_status;
		}

		/* Did the event thread exit? => return with an error */
		if (evdone) {
			if (pthread_join(evtid, &exit_status)) {
				log_warn("%s: failed to join event thread - "
				    "exiting", __progname);
				return (EIO);
			}

			log_warnx("%s: vm %d event thread exited "
			    "unexpectedly", __progname, vcp->vcp_id);
			return (EIO);
		}

		/* Did all VCPU threads exit successfully? => return */
		for (i = 0; i < vcp->vcp_ncpus; i++) {
			if (vcpu_done[i] == 0)
				break;
		}
		if (i == vcp->vcp_ncpus)
			return (ret);

		/* Some more threads to wait for, start over */
	}

	return (ret);
}

void *
event_thread(void *arg)
{
	uint8_t *donep = arg;
	intptr_t ret;

	ret = event_dispatch();

	mutex_lock(&threadmutex);
	*donep = 1;
	pthread_cond_signal(&threadcond);
	mutex_unlock(&threadmutex);

	return (void *)ret;
}

/*
 * vcpu_run_loop
 *
 * Runs a single VCPU until vmm(4) requires help handling an exit,
 * or the VM terminates.
 *
 * Parameters:
 *  arg: vcpu_run_params for the VCPU being run by this thread
 *
 * Return values:
 *  NULL: the VCPU shutdown properly
 *  !NULL: error processing VCPU run, or the VCPU shutdown abnormally
 */
void *
vcpu_run_loop(void *arg)
{
	struct vm_run_params *vrp = (struct vm_run_params *)arg;
	intptr_t ret = 0;
	int irq;
	uint32_t n;

	vrp->vrp_continue = 0;
	n = vrp->vrp_vcpu_id;

	for (;;) {
		ret = pthread_mutex_lock(&vcpu_run_mtx[n]);

		if (ret) {
			log_warnx("%s: can't lock vcpu run mtx (%d)",
			    __func__, (int)ret);
			return ((void *)ret);
		}

		/* If we are halted and need to pause, pause */
		if (vcpu_hlt[n] && (current_vm->vm_state & VM_STATE_PAUSED)) {
			ret = pthread_cond_broadcast(&vcpu_pause_cond[n]);
			if (ret) {
				log_warnx("%s: can't broadcast vcpu pause mtx"
				    "(%d)", __func__, (int)ret);
				return ((void *)ret);
			}

			ret = pthread_mutex_lock(&vcpu_unpause_mtx[n]);
			if (ret) {
				log_warnx("%s: can't lock vcpu unpause mtx (%d)",
				    __func__, (int)ret);
				return ((void *)ret);
			}

			ret = pthread_cond_wait(&vcpu_unpause_cond[n],
			    &vcpu_unpause_mtx[n]);
			if (ret) {
				log_warnx(
				    "%s: can't wait on unpause cond (%d)",
				    __func__, (int)ret);
				break;
			}
			ret = pthread_mutex_unlock(&vcpu_unpause_mtx[n]);
			if (ret) {
				log_warnx("%s: can't unlock unpause mtx (%d)",
				    __func__, (int)ret);
				break;
			}
		}

		/* If we are halted and not paused, wait */
		if (vcpu_hlt[n]) {
			ret = pthread_cond_wait(&vcpu_run_cond[n],
			    &vcpu_run_mtx[n]);

			if (ret) {
				log_warnx(
				    "%s: can't wait on cond (%d)",
				    __func__, (int)ret);
				(void)pthread_mutex_unlock(
				    &vcpu_run_mtx[n]);
				break;
			}
		}

		ret = pthread_mutex_unlock(&vcpu_run_mtx[n]);

		if (ret) {
			log_warnx("%s: can't unlock mutex on cond (%d)",
			    __func__, (int)ret);
			break;
		}

		if (vrp->vrp_irqready && i8259_is_pending()) {
			irq = i8259_ack();
			vrp->vrp_irq = irq;
		} else
			vrp->vrp_irq = 0xFFFF;

		/* Still more pending? */
		if (i8259_is_pending()) {
			/* XXX can probably avoid ioctls here by providing intr in vrp */
			if (vcpu_pic_intr(vrp->vrp_vm_id,
			    vrp->vrp_vcpu_id, 1)) {
				fatal("can't set INTR");
			}
		} else {
			if (vcpu_pic_intr(vrp->vrp_vm_id,
			    vrp->vrp_vcpu_id, 0)) {
				fatal("can't clear INTR");
			}
		}

		if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) == -1) {
			/* If run ioctl failed, exit */
			ret = errno;
			log_warn("%s: vm %d / vcpu %d run ioctl failed",
			    __func__, vrp->vrp_vm_id, n);
			break;
		}

		/* If the VM is terminating, exit normally */
		if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED) {
			ret = (intptr_t)NULL;
			break;
		}

		if (vrp->vrp_exit_reason != VM_EXIT_NONE) {
			/*
			 * vmm(4) needs help handling an exit, handle in
			 * vcpu_exit.
			 */
			ret = vcpu_exit(vrp);
			if (ret)
				break;
		}
	}

	mutex_lock(&threadmutex);
	vcpu_done[n] = 1;
	pthread_cond_signal(&threadcond);
	mutex_unlock(&threadmutex);

	return ((void *)ret);
}

int
vcpu_pic_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr)
{
	struct vm_intr_params vip;

	memset(&vip, 0, sizeof(vip));

	vip.vip_vm_id = vm_id;
	vip.vip_vcpu_id = vcpu_id; /* XXX always 0? */
	vip.vip_intr = intr;

	if (ioctl(env->vmd_fd, VMM_IOC_INTR, &vip) == -1)
		return (errno);

	return (0);
}

/*
 * vcpu_exit_pci
 *
 * Handle all I/O to the emulated PCI subsystem.
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 *
 * Return value:
 *  Interrupt to inject to the guest VM, or 0xFF if no interrupt should
 *      be injected.
 */
uint8_t
vcpu_exit_pci(struct vm_run_params *vrp)
{
	struct vm_exit *vei = vrp->vrp_exit;
	uint8_t intr;

	intr = 0xFF;

	switch (vei->vei.vei_port) {
	case PCI_MODE1_ADDRESS_REG:
		pci_handle_address_reg(vrp);
		break;
	case PCI_MODE1_DATA_REG:
	case PCI_MODE1_DATA_REG + 1:
	case PCI_MODE1_DATA_REG + 2:
	case PCI_MODE1_DATA_REG + 3:
		pci_handle_data_reg(vrp);
		break;
	case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END:
		intr = pci_handle_io(vrp);
		break;
	default:
		log_warnx("%s: unknown PCI register 0x%llx",
		    __progname, (uint64_t)vei->vei.vei_port);
		break;
	}

	return (intr);
}

/*
 * vcpu_exit_inout
 *
 * Handle all I/O exits that need to be emulated in vmd. This includes the
 * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device.
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 */
void
vcpu_exit_inout(struct vm_run_params *vrp)
{
	struct vm_exit *vei = vrp->vrp_exit;
	uint8_t intr = 0xFF;

	if (ioports_map[vei->vei.vei_port] != NULL)
		intr = ioports_map[vei->vei.vei_port](vrp);
	else if (vei->vei.vei_dir == VEI_DIR_IN)
		set_return_data(vei, 0xFFFFFFFF);

	if (intr != 0xFF)
		vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr);
}

/*
 * vcpu_exit
 *
 * Handle a vcpu exit. This function is called when it is determined that
 * vmm(4) requires the assistance of vmd to support a particular guest
 * exit type (eg, accessing an I/O port or device). Guest state is contained
Guest state is contained 1608 * in 'vrp', and will be resent to vmm(4) on exit completion. 1609 * 1610 * Upon conclusion of handling the exit, the function determines if any 1611 * interrupts should be injected into the guest, and asserts the proper 1612 * IRQ line whose interrupt should be vectored. 1613 * 1614 * Parameters: 1615 * vrp: vcpu run parameters containing guest state for this exit 1616 * 1617 * Return values: 1618 * 0: the exit was handled successfully 1619 * 1: an error occurred (eg, unknown exit reason passed in 'vrp') 1620 */ 1621 int 1622 vcpu_exit(struct vm_run_params *vrp) 1623 { 1624 int ret; 1625 1626 switch (vrp->vrp_exit_reason) { 1627 case VMX_EXIT_INT_WINDOW: 1628 case SVM_VMEXIT_VINTR: 1629 case VMX_EXIT_CPUID: 1630 case VMX_EXIT_EXTINT: 1631 case SVM_VMEXIT_INTR: 1632 case VMX_EXIT_EPT_VIOLATION: 1633 case SVM_VMEXIT_NPF: 1634 case SVM_VMEXIT_MSR: 1635 case SVM_VMEXIT_CPUID: 1636 /* 1637 * We may be exiting to vmd to handle a pending interrupt but 1638 * at the same time the last exit type may have been one of 1639 * these. In this case, there's nothing extra to be done 1640 * here (and falling through to the default case below results 1641 * in more vmd log spam). 1642 */ 1643 break; 1644 case VMX_EXIT_IO: 1645 case SVM_VMEXIT_IOIO: 1646 vcpu_exit_inout(vrp); 1647 break; 1648 case VMX_EXIT_HLT: 1649 case SVM_VMEXIT_HLT: 1650 ret = pthread_mutex_lock(&vcpu_run_mtx[vrp->vrp_vcpu_id]); 1651 if (ret) { 1652 log_warnx("%s: can't lock vcpu mutex (%d)", 1653 __func__, ret); 1654 return (ret); 1655 } 1656 vcpu_hlt[vrp->vrp_vcpu_id] = 1; 1657 ret = pthread_mutex_unlock(&vcpu_run_mtx[vrp->vrp_vcpu_id]); 1658 if (ret) { 1659 log_warnx("%s: can't unlock vcpu mutex (%d)", 1660 __func__, ret); 1661 return (ret); 1662 } 1663 break; 1664 case VMX_EXIT_TRIPLE_FAULT: 1665 case SVM_VMEXIT_SHUTDOWN: 1666 /* reset VM */ 1667 return (EAGAIN); 1668 default: 1669 log_debug("%s: unknown exit reason 0x%x", 1670 __progname, vrp->vrp_exit_reason); 1671 } 1672 1673 /* Process any pending traffic */ 1674 vionet_process_rx(vrp->vrp_vm_id); 1675 1676 vrp->vrp_continue = 1; 1677 1678 return (0); 1679 } 1680 1681 /* 1682 * find_gpa_range 1683 * 1684 * Search for a contiguous guest physical mem range. 1685 * 1686 * Parameters: 1687 * vcp: VM create parameters that contain the memory map to search in 1688 * gpa: the starting guest physical address 1689 * len: the length of the memory range 1690 * 1691 * Return values: 1692 * NULL: on failure if there is no memory range as described by the parameters 1693 * Pointer to vm_mem_range that contains the start of the range otherwise. 1694 */ 1695 static struct vm_mem_range * 1696 find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len) 1697 { 1698 size_t i, n; 1699 struct vm_mem_range *vmr; 1700 1701 /* Find the first vm_mem_range that contains gpa */ 1702 for (i = 0; i < vcp->vcp_nmemranges; i++) { 1703 vmr = &vcp->vcp_memranges[i]; 1704 if (vmr->vmr_gpa + vmr->vmr_size >= gpa) 1705 break; 1706 } 1707 1708 /* No range found. */ 1709 if (i == vcp->vcp_nmemranges) 1710 return (NULL); 1711 1712 /* 1713 * vmr may cover the range [gpa, gpa + len) only partly. Make 1714 * sure that the following vm_mem_ranges are contiguous and 1715 * cover the rest. 
	n = vmr->vmr_size - (gpa - vmr->vmr_gpa);
	if (len < n)
		len = 0;
	else
		len -= n;
	gpa = vmr->vmr_gpa + vmr->vmr_size;
	for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		if (gpa != vmr->vmr_gpa)
			return (NULL);
		if (len <= vmr->vmr_size)
			len = 0;
		else
			len -= vmr->vmr_size;

		gpa = vmr->vmr_gpa + vmr->vmr_size;
	}

	if (len != 0)
		return (NULL);

	return (vmr);
}

void *
vaddr_mem(paddr_t gpa, size_t len)
{
	struct vm_create_params *vcp = &current_vm->vm_params.vmc_params;
	size_t i;
	struct vm_mem_range *vmr;
	paddr_t gpend = gpa + len;

	/* Find the first vm_mem_range that contains gpa */
	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		if (gpa < vmr->vmr_gpa)
			continue;

		if (gpend >= vmr->vmr_gpa + vmr->vmr_size)
			continue;

		return ((char *)vmr->vmr_va + (gpa - vmr->vmr_gpa));
	}

	return (NULL);
}

/*
 * write_mem
 *
 * Copies data from 'buf' into the guest VM's memory at paddr 'dst'.
 *
 * Parameters:
 *  dst: the destination paddr_t in the guest VM
 *  buf: data to copy (or NULL to zero the data)
 *  len: number of bytes to copy
 *
 * Return values:
 *  0: success
 *  EINVAL: if the guest physical memory range [dst, dst + len) does not
 *      exist in the guest.
 */
int
write_mem(paddr_t dst, const void *buf, size_t len)
{
	const char *from = buf;
	char *to;
	size_t n, off;
	struct vm_mem_range *vmr;

	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, dst, len);
	if (vmr == NULL) {
		errno = EINVAL;
		log_warn("%s: failed - invalid memory range dst = 0x%lx, "
		    "len = 0x%zx", __func__, dst, len);
		return (EINVAL);
	}

	off = dst - vmr->vmr_gpa;
	while (len != 0) {
		n = vmr->vmr_size - off;
		if (len < n)
			n = len;

		to = (char *)vmr->vmr_va + off;
		if (buf == NULL)
			memset(to, 0, n);
		else {
			memcpy(to, from, n);
			from += n;
		}
		len -= n;
		off = 0;
		vmr++;
	}

	return (0);
}

/*
 * read_mem
 *
 * Reads memory at guest paddr 'src' into 'buf'.
 *
 * Parameters:
 *  src: the source paddr_t in the guest VM to read from.
 *  buf: destination (local) buffer
 *  len: number of bytes to read
 *
 * Return values:
 *  0: success
 *  EINVAL: if the guest physical memory range [src, src + len) does not
 *      exist in the guest.
 */
int
read_mem(paddr_t src, void *buf, size_t len)
{
	char *from, *to = buf;
	size_t n, off;
	struct vm_mem_range *vmr;

	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
	if (vmr == NULL) {
		errno = EINVAL;
		log_warn("%s: failed - invalid memory range src = 0x%lx, "
		    "len = 0x%zx", __func__, src, len);
		return (EINVAL);
	}

	off = src - vmr->vmr_gpa;
	while (len != 0) {
		n = vmr->vmr_size - off;
		if (len < n)
			n = len;

		from = (char *)vmr->vmr_va + off;
		memcpy(to, from, n);

		to += n;
		len -= n;
		off = 0;
		vmr++;
	}

	return (0);
}

int
iovec_mem(paddr_t src, size_t len, struct iovec *iov, int iovcnt)
{
	size_t n, off;
	struct vm_mem_range *vmr;
	int niov = 0;

	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
	if (vmr == NULL) {
		errno = EINVAL;
		return (-1);
	}

	off = src - vmr->vmr_gpa;
	while (len > 0) {
		if (niov == iovcnt) {
			errno = ENOMEM;
			return (-1);
		}

		n = vmr->vmr_size - off;
		if (len < n)
			n = len;

		iov[niov].iov_base = (char *)vmr->vmr_va + off;
		iov[niov].iov_len = n;

		niov++;

		len -= n;
		off = 0;
		vmr++;
	}

	return (niov);
}

/*
 * vcpu_assert_pic_irq
 *
 * Injects the specified IRQ on the supplied vcpu/vm
 *
 * Parameters:
 *  vm_id: VM ID to inject to
 *  vcpu_id: VCPU ID to inject to
 *  irq: IRQ to inject
 */
void
vcpu_assert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
{
	int ret;

	i8259_assert_irq(irq);

	if (i8259_is_pending()) {
		if (vcpu_pic_intr(vm_id, vcpu_id, 1))
			fatalx("%s: can't assert INTR", __func__);

		ret = pthread_mutex_lock(&vcpu_run_mtx[vcpu_id]);
		if (ret)
			fatalx("%s: can't lock vcpu mtx (%d)", __func__, ret);

		vcpu_hlt[vcpu_id] = 0;
		ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]);
		if (ret)
			fatalx("%s: can't signal (%d)", __func__, ret);
		ret = pthread_mutex_unlock(&vcpu_run_mtx[vcpu_id]);
		if (ret)
			fatalx("%s: can't unlock vcpu mtx (%d)", __func__, ret);
	}
}

/*
 * vcpu_deassert_pic_irq
 *
 * Clears the specified IRQ on the supplied vcpu/vm
 *
 * Parameters:
 *  vm_id: VM ID to clear in
 *  vcpu_id: VCPU ID to clear in
 *  irq: IRQ to clear
 */
void
vcpu_deassert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
{
	i8259_deassert_irq(irq);

	if (!i8259_is_pending()) {
		if (vcpu_pic_intr(vm_id, vcpu_id, 0))
			fatalx("%s: can't deassert INTR for vm_id %d, "
			    "vcpu_id %d", __func__, vm_id, vcpu_id);
	}
}

/*
 * fd_hasdata
 *
 * Determines if data can be read from a file descriptor.
 *
 * Parameters:
 *  fd: the fd to check
 *
 * Return values:
 *  1 if data can be read from an fd, or 0 otherwise.
 */
int
fd_hasdata(int fd)
{
	struct pollfd pfd[1];
	int nready, hasdata = 0;

	pfd[0].fd = fd;
	pfd[0].events = POLLIN;
	nready = poll(pfd, 1, 0);
	if (nready == -1)
		log_warn("checking file descriptor for data failed");
	else if (nready == 1 && pfd[0].revents & POLLIN)
		hasdata = 1;
	return (hasdata);
}

/*
 * mutex_lock
 *
 * Wrapper function for pthread_mutex_lock that does error checking and that
 * exits on failure
 */
void
mutex_lock(pthread_mutex_t *m)
{
	int ret;

	ret = pthread_mutex_lock(m);
	if (ret) {
		errno = ret;
		fatal("could not acquire mutex");
	}
}

/*
 * mutex_unlock
 *
 * Wrapper function for pthread_mutex_unlock that does error checking and that
 * exits on failure
 */
void
mutex_unlock(pthread_mutex_t *m)
{
	int ret;

	ret = pthread_mutex_unlock(m);
	if (ret) {
		errno = ret;
		fatal("could not release mutex");
	}
}

/*
 * set_return_data
 *
 * Utility function for manipulating register data in vm exit info structs. This
 * function ensures that the data is copied to the vei->vei.vei_data field with
 * the proper size for the operation being performed.
 *
 * Parameters:
 *  vei: exit information
 *  data: return data
 */
void
set_return_data(struct vm_exit *vei, uint32_t data)
{
	switch (vei->vei.vei_size) {
	case 1:
		vei->vei.vei_data &= ~0xFF;
		vei->vei.vei_data |= (uint8_t)data;
		break;
	case 2:
		vei->vei.vei_data &= ~0xFFFF;
		vei->vei.vei_data |= (uint16_t)data;
		break;
	case 4:
		vei->vei.vei_data = data;
		break;
	}
}

/*
 * get_input_data
 *
 * Utility function for manipulating register data in vm exit info
 * structs. This function ensures that the data is copied from the
 * vei->vei.vei_data field with the proper size for the operation being
 * performed.
 *
 * Parameters:
 *  vei: exit information
 *  data: location to store the result
 */
void
get_input_data(struct vm_exit *vei, uint32_t *data)
{
	switch (vei->vei.vei_size) {
	case 1:
		*data &= 0xFFFFFF00;
		*data |= (uint8_t)vei->vei.vei_data;
		break;
	case 2:
		*data &= 0xFFFF0000;
		*data |= (uint16_t)vei->vei.vei_data;
		break;
	case 4:
		*data = vei->vei.vei_data;
		break;
	default:
		log_warnx("%s: invalid i/o size %d", __func__,
		    vei->vei.vei_size);
	}

}

/*
 * translate_gva
 *
 * Translates a guest virtual address to a guest physical address by walking
 * the currently active page table (if needed).
 *
 * Note - this function can possibly alter the supplied VCPU state.
 * Specifically, it may inject exceptions depending on the current VCPU
 * configuration, and may alter %cr2 on #PF. Consequently, this function
 * should only be used as part of instruction emulation.
 *
 * Parameters:
 *  exit: The VCPU this translation should be performed for (guest MMU settings
 *   are gathered from this VCPU)
 *  va: virtual address to translate
 *  pa: pointer to paddr_t variable that will receive the translated physical
 *   address. 'pa' is unchanged on error.
 *  mode: one of PROT_READ, PROT_WRITE, PROT_EXEC indicating the mode in which
 *   the address should be translated
 *
 * Return values:
 *  0: the address was successfully translated - 'pa' contains the physical
 *     address currently mapped by 'va'.
 *  EFAULT: the PTE for 'VA' is unmapped. A #PF will be injected in this case
 *     and %cr2 set in the vcpu structure.
 *  EINVAL: an error occurred reading paging table structures
 */
int
translate_gva(struct vm_exit *exit, uint64_t va, uint64_t *pa, int mode)
{
	int level, shift, pdidx;
	uint64_t pte, pt_paddr, pte_paddr, mask, low_mask, high_mask;
	uint64_t shift_width, pte_size;
	struct vcpu_reg_state *vrs;

	vrs = &exit->vrs;

	if (!pa)
		return (EINVAL);

	if (!(vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PG)) {
		log_debug("%s: unpaged, va=pa=0x%llx", __func__, va);
		*pa = va;
		return (0);
	}

	pt_paddr = vrs->vrs_crs[VCPU_REGS_CR3];

	log_debug("%s: guest %%cr0=0x%llx, %%cr3=0x%llx", __func__,
	    vrs->vrs_crs[VCPU_REGS_CR0], vrs->vrs_crs[VCPU_REGS_CR3]);

	if (vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PE) {
		if (vrs->vrs_crs[VCPU_REGS_CR4] & CR4_PAE) {
			pte_size = sizeof(uint64_t);
			shift_width = 9;

			if (vrs->vrs_msrs[VCPU_REGS_EFER] & EFER_LMA) {
				/* 4 level paging */
				level = 4;
				mask = L4_MASK;
				shift = L4_SHIFT;
			} else {
				/* 32 bit with PAE paging */
				level = 3;
				mask = L3_MASK;
				shift = L3_SHIFT;
			}
		} else {
			/* 32 bit paging */
			level = 2;
			shift_width = 10;
			mask = 0xFFC00000;
			shift = 22;
			pte_size = sizeof(uint32_t);
		}
	} else
		return (EINVAL);

	/* XXX: Check for R bit in segment selector and set A bit */

	for (; level > 0; level--) {
		pdidx = (va & mask) >> shift;
		pte_paddr = (pt_paddr) + (pdidx * pte_size);

		log_debug("%s: read pte level %d @ GPA 0x%llx", __func__,
		    level, pte_paddr);
		if (read_mem(pte_paddr, &pte, pte_size)) {
			log_warn("%s: failed to read pte", __func__);
			return (EFAULT);
		}

		log_debug("%s: PTE @ 0x%llx = 0x%llx", __func__, pte_paddr,
		    pte);

		/* XXX: Set CR2 */
		if (!(pte & PG_V))
			return (EFAULT);

		/* XXX: Check for SMAP */
		if ((mode == PROT_WRITE) && !(pte & PG_RW))
			return (EPERM);

		if ((exit->cpl > 0) && !(pte & PG_u))
			return (EPERM);

		pte = pte | PG_U;
		if (mode == PROT_WRITE)
			pte = pte | PG_M;
		if (write_mem(pte_paddr, &pte, pte_size)) {
			log_warn("%s: failed to write back flags to pte",
			    __func__);
			return (EIO);
		}

		/* XXX: EINVAL if in 32bit and PG_PS is 1 but CR4.PSE is 0 */
		if (pte & PG_PS)
			break;

		if (level > 1) {
			pt_paddr = pte & PG_FRAME;
			shift -= shift_width;
			mask = mask >> shift_width;
		}
	}

	low_mask = (1 << shift) - 1;
	high_mask = (((uint64_t)1ULL << ((pte_size * 8) - 1)) - 1) ^ low_mask;
	*pa = (pte & high_mask) | (va & low_mask);

	log_debug("%s: final GPA for GVA 0x%llx = 0x%llx\n", __func__, va, *pa);

	return (0);
}