1 /* $OpenBSD: vm.c,v 1.68 2022/03/01 21:46:19 dv Exp $ */ 2 3 /* 4 * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 19 #include <sys/param.h> /* PAGE_SIZE */ 20 #include <sys/types.h> 21 #include <sys/ioctl.h> 22 #include <sys/queue.h> 23 #include <sys/wait.h> 24 #include <sys/uio.h> 25 #include <sys/stat.h> 26 #include <sys/socket.h> 27 #include <sys/time.h> 28 #include <sys/mman.h> 29 #include <sys/resource.h> 30 31 #include <dev/ic/i8253reg.h> 32 #include <dev/isa/isareg.h> 33 #include <dev/pci/pcireg.h> 34 35 #include <machine/psl.h> 36 #include <machine/pte.h> 37 #include <machine/specialreg.h> 38 #include <machine/vmmvar.h> 39 40 #include <net/if.h> 41 42 #include <errno.h> 43 #include <event.h> 44 #include <fcntl.h> 45 #include <imsg.h> 46 #include <limits.h> 47 #include <poll.h> 48 #include <pthread.h> 49 #include <stddef.h> 50 #include <stdio.h> 51 #include <stdlib.h> 52 #include <string.h> 53 #include <unistd.h> 54 #include <util.h> 55 56 #include "atomicio.h" 57 #include "fw_cfg.h" 58 #include "i8253.h" 59 #include "i8259.h" 60 #include "loadfile.h" 61 #include "mc146818.h" 62 #include "ns8250.h" 63 #include "pci.h" 64 #include "virtio.h" 65 #include "vmd.h" 66 #include "vmm.h" 67 68 io_fn_t ioports_map[MAX_PORTS]; 69 70 int run_vm(int, int[][VM_MAX_BASE_PER_DISK], int *, 71 struct vmop_create_params *, struct vcpu_reg_state *); 72 void vm_dispatch_vmm(int, short, void *); 73 void *event_thread(void *); 74 void *vcpu_run_loop(void *); 75 int vcpu_exit(struct vm_run_params *); 76 int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *); 77 void create_memory_map(struct vm_create_params *); 78 int alloc_guest_mem(struct vm_create_params *); 79 int vmm_create_vm(struct vm_create_params *); 80 void init_emulated_hw(struct vmop_create_params *, int, 81 int[][VM_MAX_BASE_PER_DISK], int *); 82 void restore_emulated_hw(struct vm_create_params *, int, int *, 83 int[][VM_MAX_BASE_PER_DISK],int); 84 void vcpu_exit_inout(struct vm_run_params *); 85 int vcpu_exit_eptviolation(struct vm_run_params *); 86 uint8_t vcpu_exit_pci(struct vm_run_params *); 87 int vcpu_pic_intr(uint32_t, uint32_t, uint8_t); 88 int loadfile_bios(gzFile, off_t, struct vcpu_reg_state *); 89 int send_vm(int, struct vm_create_params *); 90 int dump_send_header(int); 91 int dump_vmr(int , struct vm_mem_range *); 92 int dump_mem(int, struct vm_create_params *); 93 void restore_vmr(int, struct vm_mem_range *); 94 void restore_mem(int, struct vm_create_params *); 95 int restore_vm_params(int, struct vm_create_params *); 96 void pause_vm(struct vm_create_params *); 97 void unpause_vm(struct vm_create_params *); 98 99 int translate_gva(struct vm_exit*, uint64_t, uint64_t *, int); 100 101 static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t, 102 
    size_t);

int con_fd;
struct vmd_vm *current_vm;

extern struct vmd *env;

extern char *__progname;

pthread_mutex_t threadmutex;
pthread_cond_t threadcond;

pthread_cond_t vcpu_run_cond[VMM_MAX_VCPUS_PER_VM];
pthread_mutex_t vcpu_run_mtx[VMM_MAX_VCPUS_PER_VM];
pthread_barrier_t vm_pause_barrier;
pthread_cond_t vcpu_unpause_cond[VMM_MAX_VCPUS_PER_VM];
pthread_mutex_t vcpu_unpause_mtx[VMM_MAX_VCPUS_PER_VM];
uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM];
uint8_t vcpu_done[VMM_MAX_VCPUS_PER_VM];

/*
 * Represents a standard register set for an OS to be booted
 * as a flat 64 bit address space.
 *
 * NOT set here are:
 *  RIP
 *  RSP
 *  GDTR BASE
 *
 * Specific bootloaders should clone this structure and override
 * those fields as needed.
 *
 * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on
 * features of the CPU in use.
 */
static const struct vcpu_reg_state vcpu_init_flat64 = {
	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
	.vrs_gprs[VCPU_REGS_RIP] = 0x0,
	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
	.vrs_crs[VCPU_REGS_CR0] = CR0_ET | CR0_PE | CR0_PG,
	.vrs_crs[VCPU_REGS_CR3] = PML4_PAGE,
	.vrs_crs[VCPU_REGS_CR4] = CR4_PAE | CR4_PSE,
	.vrs_crs[VCPU_REGS_PDPTE0] = 0ULL,
	.vrs_crs[VCPU_REGS_PDPTE1] = 0ULL,
	.vrs_crs[VCPU_REGS_PDPTE2] = 0ULL,
	.vrs_crs[VCPU_REGS_PDPTE3] = 0ULL,
	.vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0},
	.vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
	.vrs_msrs[VCPU_REGS_EFER] = EFER_LME | EFER_LMA,
	.vrs_drs[VCPU_REGS_DR0] = 0x0,
	.vrs_drs[VCPU_REGS_DR1] = 0x0,
	.vrs_drs[VCPU_REGS_DR2] = 0x0,
	.vrs_drs[VCPU_REGS_DR3] = 0x0,
	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
	.vrs_drs[VCPU_REGS_DR7] = 0x400,
	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
	.vrs_msrs[VCPU_REGS_MISC_ENABLE] = 0ULL,
	.vrs_crs[VCPU_REGS_XCR0] = XCR0_X87
};

/*
 * Represents a standard register set for a BIOS to be booted
 * as a flat 16 bit address space.
 */
static const struct vcpu_reg_state vcpu_init_flat16 = {
	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
	.vrs_gprs[VCPU_REGS_RIP] = 0xFFF0,
	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
	.vrs_crs[VCPU_REGS_CR0] = 0x60000010,
	.vrs_crs[VCPU_REGS_CR3] = 0,
	.vrs_sregs[VCPU_REGS_CS] = { 0xF000, 0xFFFF, 0x809F, 0xF0000},
	.vrs_sregs[VCPU_REGS_DS] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_sregs[VCPU_REGS_ES] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_sregs[VCPU_REGS_FS] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_sregs[VCPU_REGS_GS] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_sregs[VCPU_REGS_SS] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
	.vrs_msrs[VCPU_REGS_EFER] = 0ULL,
	.vrs_drs[VCPU_REGS_DR0] = 0x0,
	.vrs_drs[VCPU_REGS_DR1] = 0x0,
	.vrs_drs[VCPU_REGS_DR2] = 0x0,
	.vrs_drs[VCPU_REGS_DR3] = 0x0,
	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
	.vrs_drs[VCPU_REGS_DR7] = 0x400,
	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
	.vrs_crs[VCPU_REGS_XCR0] = XCR0_X87
};

/*
 * loadfile_bios
 *
 * As an alternative to loadfile_elf, this function loads a non-ELF BIOS image
 * directly into memory.
 *
 * Parameters:
 *  fp: file handle of the BIOS image to load
 *  size: uncompressed size of the image
 *  (out) vrs: register state to set on init for this kernel
 *
 * Return values:
 *  0 if successful
 *  various error codes returned from read(2) or loadelf functions
 */
int
loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_state *vrs)
{
	off_t off;

	/* Set up a "flat 16 bit" register state for BIOS */
	memcpy(vrs, &vcpu_init_flat16, sizeof(*vrs));

	/* Seek to the beginning of the BIOS image */
	if (gzseek(fp, 0, SEEK_SET) == -1)
		return (-1);

	/* The BIOS image must end at 1M */
	if ((off = 1048576 - size) < 0)
		return (-1);

	/* Read BIOS image into memory */
	if (mread(fp, off, size) != (size_t)size) {
		errno = EIO;
		return (-1);
	}

	log_debug("%s: loaded BIOS image", __func__);

	return (0);
}

/*
 * start_vm
 *
 * After forking a new VM process, starts the new VM with the creation
 * parameters supplied (in the incoming vm->vm_params field). This
 * function performs a basic sanity check on the incoming parameters
 * and then performs the following steps to complete the creation of the VM:
 *
 * 1. validates and creates the new VM
 * 2. opens the imsg control channel to the parent and drops more privilege
 * 3. drops additional privileges by calling pledge(2)
 * 4. loads the kernel from the disk image or file descriptor
 * 5. runs the VM's VCPU loops.
 *
 * Parameters:
 *  vm: The VM data structure containing the VM create parameters.
 *  fd: The imsg socket that is connected to the parent process.
268 * 269 * Return values: 270 * 0: success 271 * !0 : failure - typically an errno indicating the source of the failure 272 */ 273 int 274 start_vm(struct vmd_vm *vm, int fd) 275 { 276 struct vmop_create_params *vmc = &vm->vm_params; 277 struct vm_create_params *vcp = &vmc->vmc_params; 278 struct vcpu_reg_state vrs; 279 int nicfds[VMM_MAX_NICS_PER_VM]; 280 int ret; 281 gzFile fp; 282 size_t i; 283 struct vm_rwregs_params vrp; 284 struct stat sb; 285 286 /* Child */ 287 setproctitle("%s", vcp->vcp_name); 288 log_procinit(vcp->vcp_name); 289 290 if (!(vm->vm_state & VM_STATE_RECEIVED)) 291 create_memory_map(vcp); 292 293 ret = alloc_guest_mem(vcp); 294 295 if (ret) { 296 struct rlimit lim; 297 char buf[FMT_SCALED_STRSIZE]; 298 if (ret == ENOMEM && getrlimit(RLIMIT_DATA, &lim) == 0) { 299 if (fmt_scaled(lim.rlim_cur, buf) == 0) 300 fatalx("could not allocate guest memory (data " 301 "limit is %s)", buf); 302 } 303 errno = ret; 304 fatal("could not allocate guest memory"); 305 } 306 307 ret = vmm_create_vm(vcp); 308 current_vm = vm; 309 310 /* send back the kernel-generated vm id (0 on error) */ 311 if (atomicio(vwrite, fd, &vcp->vcp_id, sizeof(vcp->vcp_id)) != 312 sizeof(vcp->vcp_id)) 313 fatal("failed to send created vm id to vmm process"); 314 315 if (ret) { 316 errno = ret; 317 fatal("create vmm ioctl failed - exiting"); 318 } 319 320 /* 321 * pledge in the vm processes: 322 * stdio - for malloc and basic I/O including events. 323 * recvfd - for send/recv. 324 * vmm - for the vmm ioctls and operations. 325 */ 326 if (pledge("stdio vmm recvfd", NULL) == -1) 327 fatal("pledge"); 328 329 if (vm->vm_state & VM_STATE_RECEIVED) { 330 ret = atomicio(read, vm->vm_receive_fd, &vrp, sizeof(vrp)); 331 if (ret != sizeof(vrp)) 332 fatal("received incomplete vrp - exiting"); 333 vrs = vrp.vrwp_regs; 334 } else { 335 /* 336 * Set up default "flat 64 bit" register state - RIP, 337 * RSP, and GDT info will be set in bootloader 338 */ 339 memcpy(&vrs, &vcpu_init_flat64, sizeof(vrs)); 340 341 /* Find and open kernel image */ 342 if ((fp = gzdopen(vm->vm_kernel, "r")) == NULL) 343 fatalx("failed to open kernel - exiting"); 344 345 /* Load kernel image */ 346 ret = loadfile_elf(fp, vcp, &vrs, vmc->vmc_bootdevice); 347 348 /* 349 * Try BIOS as a fallback (only if it was provided as an image 350 * with vm->vm_kernel and the file is not compressed) 351 */ 352 if (ret && errno == ENOEXEC && vm->vm_kernel != -1 && 353 gzdirect(fp) && (ret = fstat(vm->vm_kernel, &sb)) == 0) 354 ret = loadfile_bios(fp, sb.st_size, &vrs); 355 356 if (ret) 357 fatal("failed to load kernel or BIOS - exiting"); 358 359 gzclose(fp); 360 } 361 362 if (vm->vm_kernel != -1) 363 close(vm->vm_kernel); 364 365 con_fd = vm->vm_tty; 366 if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1) 367 fatal("failed to set nonblocking mode on console"); 368 369 for (i = 0; i < VMM_MAX_NICS_PER_VM; i++) 370 nicfds[i] = vm->vm_ifs[i].vif_fd; 371 372 event_init(); 373 374 if (vm->vm_state & VM_STATE_RECEIVED) { 375 restore_emulated_hw(vcp, vm->vm_receive_fd, nicfds, 376 vm->vm_disks, vm->vm_cdrom); 377 restore_mem(vm->vm_receive_fd, vcp); 378 if (restore_vm_params(vm->vm_receive_fd, vcp)) 379 fatal("restore vm params failed"); 380 unpause_vm(vcp); 381 } 382 383 if (vmm_pipe(vm, fd, vm_dispatch_vmm) == -1) 384 fatal("setup vm pipe"); 385 386 /* Execute the vcpu run loop(s) for this VM */ 387 ret = run_vm(vm->vm_cdrom, vm->vm_disks, nicfds, &vm->vm_params, &vrs); 388 389 /* Ensure that any in-flight data is written back */ 390 virtio_shutdown(vm); 391 392 return 
(ret);
}

/*
 * vm_dispatch_vmm
 *
 * imsg callback for messages that are received from the vmm parent process.
 */
void
vm_dispatch_vmm(int fd, short event, void *arg)
{
	struct vmd_vm *vm = arg;
	struct vmop_result vmr;
	struct vmop_addr_result var;
	struct imsgev *iev = &vm->vm_iev;
	struct imsgbuf *ibuf = &iev->ibuf;
	struct imsg imsg;
	ssize_t n;
	int verbose;

	if (event & EV_READ) {
		if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
			fatal("%s: imsg_read", __func__);
		if (n == 0)
			_exit(0);
	}

	if (event & EV_WRITE) {
		if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
			fatal("%s: msgbuf_write fd %d", __func__, ibuf->fd);
		if (n == 0)
			_exit(0);
	}

	for (;;) {
		if ((n = imsg_get(ibuf, &imsg)) == -1)
			fatal("%s: imsg_get", __func__);
		if (n == 0)
			break;

#if DEBUG > 1
		log_debug("%s: got imsg %d from %s",
		    __func__, imsg.hdr.type,
		    vm->vm_params.vmc_params.vcp_name);
#endif

		switch (imsg.hdr.type) {
		case IMSG_CTL_VERBOSE:
			IMSG_SIZE_CHECK(&imsg, &verbose);
			memcpy(&verbose, imsg.data, sizeof(verbose));
			log_setverbose(verbose);
			break;
		case IMSG_VMDOP_VM_SHUTDOWN:
			if (vmmci_ctl(VMMCI_SHUTDOWN) == -1)
				_exit(0);
			break;
		case IMSG_VMDOP_VM_REBOOT:
			if (vmmci_ctl(VMMCI_REBOOT) == -1)
				_exit(0);
			break;
		case IMSG_VMDOP_PAUSE_VM:
			vmr.vmr_result = 0;
			vmr.vmr_id = vm->vm_vmid;
			pause_vm(&vm->vm_params.vmc_params);
			imsg_compose_event(&vm->vm_iev,
			    IMSG_VMDOP_PAUSE_VM_RESPONSE,
			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
			    sizeof(vmr));
			break;
		case IMSG_VMDOP_UNPAUSE_VM:
			vmr.vmr_result = 0;
			vmr.vmr_id = vm->vm_vmid;
			unpause_vm(&vm->vm_params.vmc_params);
			imsg_compose_event(&vm->vm_iev,
			    IMSG_VMDOP_UNPAUSE_VM_RESPONSE,
			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
			    sizeof(vmr));
			break;
		case IMSG_VMDOP_SEND_VM_REQUEST:
			vmr.vmr_id = vm->vm_vmid;
			vmr.vmr_result = send_vm(imsg.fd,
			    &vm->vm_params.vmc_params);
			imsg_compose_event(&vm->vm_iev,
			    IMSG_VMDOP_SEND_VM_RESPONSE,
			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
			    sizeof(vmr));
			if (!vmr.vmr_result) {
				imsg_flush(&current_vm->vm_iev.ibuf);
				_exit(0);
			}
			break;
		case IMSG_VMDOP_PRIV_GET_ADDR_RESPONSE:
			IMSG_SIZE_CHECK(&imsg, &var);
			memcpy(&var, imsg.data, sizeof(var));

			log_debug("%s: received tap addr %s for nic %d",
			    vm->vm_params.vmc_params.vcp_name,
			    ether_ntoa((void *)var.var_addr), var.var_nic_idx);

			vionet_set_hostmac(vm, var.var_nic_idx, var.var_addr);
			break;
		default:
			fatalx("%s: got invalid imsg %d from %s",
			    __func__, imsg.hdr.type,
			    vm->vm_params.vmc_params.vcp_name);
		}
		imsg_free(&imsg);
	}
	imsg_event_add(iev);
}

/*
 * vm_shutdown
 *
 * Tell the vmm parent process to shut down or reboot the VM and exit.
 */
__dead void
vm_shutdown(unsigned int cmd)
{
	switch (cmd) {
	case VMMCI_NONE:
	case VMMCI_SHUTDOWN:
		(void)imsg_compose_event(&current_vm->vm_iev,
		    IMSG_VMDOP_VM_SHUTDOWN, 0, 0, -1, NULL, 0);
		break;
	case VMMCI_REBOOT:
		(void)imsg_compose_event(&current_vm->vm_iev,
		    IMSG_VMDOP_VM_REBOOT, 0, 0, -1, NULL, 0);
		break;
	default:
		fatalx("invalid vm ctl command: %d", cmd);
	}
	imsg_flush(&current_vm->vm_iev.ibuf);

	_exit(0);
}

int
send_vm(int fd, struct vm_create_params *vcp)
{
	struct vm_rwregs_params vrp;
	struct vm_rwvmparams_params vpp;
	struct vmop_create_params *vmc;
	struct vm_terminate_params vtp;
	unsigned int flags = 0;
	unsigned int i;
	int ret = 0;
	size_t sz;

	if (dump_send_header(fd)) {
		log_info("%s: failed to send vm dump header", __func__);
		goto err;
	}

	pause_vm(vcp);

	vmc = calloc(1, sizeof(struct vmop_create_params));
	if (vmc == NULL) {
		log_warn("%s: calloc error getting vmc", __func__);
		ret = -1;
		goto err;
	}

	flags |= VMOP_CREATE_MEMORY;
	memcpy(&vmc->vmc_params, &current_vm->vm_params, sizeof(struct
	    vmop_create_params));
	vmc->vmc_flags = flags;
	vrp.vrwp_vm_id = vcp->vcp_id;
	vrp.vrwp_mask = VM_RWREGS_ALL;
	vpp.vpp_mask = VM_RWVMPARAMS_ALL;
	vpp.vpp_vm_id = vcp->vcp_id;

	sz = atomicio(vwrite, fd, vmc, sizeof(struct vmop_create_params));
	if (sz != sizeof(struct vmop_create_params)) {
		ret = -1;
		goto err;
	}

	for (i = 0; i < vcp->vcp_ncpus; i++) {
		vrp.vrwp_vcpu_id = i;
		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READREGS, &vrp))) {
			log_warn("%s: readregs failed", __func__);
			goto err;
		}

		sz = atomicio(vwrite, fd, &vrp,
		    sizeof(struct vm_rwregs_params));
		if (sz != sizeof(struct vm_rwregs_params)) {
			log_warn("%s: dumping registers failed", __func__);
			ret = -1;
			goto err;
		}
	}

	if ((ret = i8253_dump(fd)))
		goto err;
	if ((ret = i8259_dump(fd)))
		goto err;
	if ((ret = ns8250_dump(fd)))
		goto err;
	if ((ret = mc146818_dump(fd)))
		goto err;
	if ((ret = fw_cfg_dump(fd)))
		goto err;
	if ((ret = pci_dump(fd)))
		goto err;
	if ((ret = virtio_dump(fd)))
		goto err;
	if ((ret = dump_mem(fd, vcp)))
		goto err;

	for (i = 0; i < vcp->vcp_ncpus; i++) {
		vpp.vpp_vcpu_id = i;
		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READVMPARAMS, &vpp))) {
			log_warn("%s: readvmparams failed", __func__);
			goto err;
		}

		sz = atomicio(vwrite, fd, &vpp,
		    sizeof(struct vm_rwvmparams_params));
		if (sz != sizeof(struct vm_rwvmparams_params)) {
			log_warn("%s: dumping vm params failed", __func__);
			ret = -1;
			goto err;
		}
	}

	vtp.vtp_vm_id = vcp->vcp_id;
	if (ioctl(env->vmd_fd, VMM_IOC_TERM, &vtp) == -1) {
		log_warnx("%s: term IOC error: %d, %d", __func__,
		    errno, ENOENT);
	}
err:
	close(fd);
	if (ret)
		unpause_vm(vcp);
	return ret;
}

int
dump_send_header(int fd) {
	struct vm_dump_header vmh;
	int i;

	memcpy(&vmh.vmh_signature, VM_DUMP_SIGNATURE,
	    sizeof(vmh.vmh_signature));

	vmh.vmh_cpuids[0].code = 0x00;
	vmh.vmh_cpuids[0].leaf = 0x00;

	vmh.vmh_cpuids[1].code = 0x01;
	vmh.vmh_cpuids[1].leaf = 0x00;

	vmh.vmh_cpuids[2].code = 0x07;
	vmh.vmh_cpuids[2].leaf = 0x00;

	vmh.vmh_cpuids[3].code = 0x0d;
	vmh.vmh_cpuids[3].leaf = 0x00;

	vmh.vmh_cpuids[4].code = 0x80000001;
vmh.vmh_cpuids[4].leaf = 0x00; 653 654 vmh.vmh_version = VM_DUMP_VERSION; 655 656 for (i=0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) { 657 CPUID_LEAF(vmh.vmh_cpuids[i].code, 658 vmh.vmh_cpuids[i].leaf, 659 vmh.vmh_cpuids[i].a, 660 vmh.vmh_cpuids[i].b, 661 vmh.vmh_cpuids[i].c, 662 vmh.vmh_cpuids[i].d); 663 } 664 665 if (atomicio(vwrite, fd, &vmh, sizeof(vmh)) != sizeof(vmh)) 666 return (-1); 667 668 return (0); 669 } 670 671 int 672 dump_mem(int fd, struct vm_create_params *vcp) 673 { 674 unsigned int i; 675 int ret; 676 struct vm_mem_range *vmr; 677 678 for (i = 0; i < vcp->vcp_nmemranges; i++) { 679 vmr = &vcp->vcp_memranges[i]; 680 ret = dump_vmr(fd, vmr); 681 if (ret) 682 return ret; 683 } 684 return (0); 685 } 686 687 int 688 restore_vm_params(int fd, struct vm_create_params *vcp) { 689 unsigned int i; 690 struct vm_rwvmparams_params vpp; 691 692 for (i = 0; i < vcp->vcp_ncpus; i++) { 693 if (atomicio(read, fd, &vpp, sizeof(vpp)) != sizeof(vpp)) { 694 log_warn("%s: error restoring vm params", __func__); 695 return (-1); 696 } 697 vpp.vpp_vm_id = vcp->vcp_id; 698 vpp.vpp_vcpu_id = i; 699 if (ioctl(env->vmd_fd, VMM_IOC_WRITEVMPARAMS, &vpp) < 0) { 700 log_debug("%s: writing vm params failed", __func__); 701 return (-1); 702 } 703 } 704 return (0); 705 } 706 707 void 708 restore_mem(int fd, struct vm_create_params *vcp) 709 { 710 unsigned int i; 711 struct vm_mem_range *vmr; 712 713 for (i = 0; i < vcp->vcp_nmemranges; i++) { 714 vmr = &vcp->vcp_memranges[i]; 715 restore_vmr(fd, vmr); 716 } 717 } 718 719 int 720 dump_vmr(int fd, struct vm_mem_range *vmr) 721 { 722 size_t rem = vmr->vmr_size, read=0; 723 char buf[PAGE_SIZE]; 724 725 while (rem > 0) { 726 if (read_mem(vmr->vmr_gpa + read, buf, PAGE_SIZE)) { 727 log_warn("failed to read vmr"); 728 return (-1); 729 } 730 if (atomicio(vwrite, fd, buf, sizeof(buf)) != sizeof(buf)) { 731 log_warn("failed to dump vmr"); 732 return (-1); 733 } 734 rem = rem - PAGE_SIZE; 735 read = read + PAGE_SIZE; 736 } 737 return (0); 738 } 739 740 void 741 restore_vmr(int fd, struct vm_mem_range *vmr) 742 { 743 size_t rem = vmr->vmr_size, wrote=0; 744 char buf[PAGE_SIZE]; 745 746 while (rem > 0) { 747 if (atomicio(read, fd, buf, sizeof(buf)) != sizeof(buf)) 748 fatal("failed to restore vmr"); 749 if (write_mem(vmr->vmr_gpa + wrote, buf, PAGE_SIZE)) 750 fatal("failed to write vmr"); 751 rem = rem - PAGE_SIZE; 752 wrote = wrote + PAGE_SIZE; 753 } 754 } 755 756 void 757 pause_vm(struct vm_create_params *vcp) 758 { 759 unsigned int n; 760 int ret; 761 if (current_vm->vm_state & VM_STATE_PAUSED) 762 return; 763 764 current_vm->vm_state |= VM_STATE_PAUSED; 765 766 ret = pthread_barrier_init(&vm_pause_barrier, NULL, vcp->vcp_ncpus + 1); 767 if (ret) { 768 log_warnx("%s: cannot initialize pause barrier (%d)", 769 __progname, ret); 770 return; 771 } 772 773 for (n = 0; n < vcp->vcp_ncpus; n++) { 774 ret = pthread_cond_broadcast(&vcpu_run_cond[n]); 775 if (ret) { 776 log_warnx("%s: can't broadcast vcpu run cond (%d)", 777 __func__, (int)ret); 778 return; 779 } 780 } 781 ret = pthread_barrier_wait(&vm_pause_barrier); 782 if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) { 783 log_warnx("%s: could not wait on pause barrier (%d)", 784 __func__, (int)ret); 785 return; 786 } 787 788 ret = pthread_barrier_destroy(&vm_pause_barrier); 789 if (ret) { 790 log_warnx("%s: could not destroy pause barrier (%d)", 791 __progname, ret); 792 return; 793 } 794 795 i8253_stop(); 796 mc146818_stop(); 797 ns8250_stop(); 798 virtio_stop(vcp); 799 } 800 801 void 802 unpause_vm(struct 
vm_create_params *vcp) 803 { 804 unsigned int n; 805 int ret; 806 if (!(current_vm->vm_state & VM_STATE_PAUSED)) 807 return; 808 809 current_vm->vm_state &= ~VM_STATE_PAUSED; 810 for (n = 0; n < vcp->vcp_ncpus; n++) { 811 ret = pthread_cond_broadcast(&vcpu_unpause_cond[n]); 812 if (ret) { 813 log_warnx("%s: can't broadcast vcpu unpause cond (%d)", 814 __func__, (int)ret); 815 return; 816 } 817 } 818 819 i8253_start(); 820 mc146818_start(); 821 ns8250_start(); 822 virtio_start(vcp); 823 } 824 825 /* 826 * vcpu_reset 827 * 828 * Requests vmm(4) to reset the VCPUs in the indicated VM to 829 * the register state provided 830 * 831 * Parameters 832 * vmid: VM ID to reset 833 * vcpu_id: VCPU ID to reset 834 * vrs: the register state to initialize 835 * 836 * Return values: 837 * 0: success 838 * !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM ID is not 839 * valid) 840 */ 841 int 842 vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs) 843 { 844 struct vm_resetcpu_params vrp; 845 846 memset(&vrp, 0, sizeof(vrp)); 847 vrp.vrp_vm_id = vmid; 848 vrp.vrp_vcpu_id = vcpu_id; 849 memcpy(&vrp.vrp_init_state, vrs, sizeof(struct vcpu_reg_state)); 850 851 log_debug("%s: resetting vcpu %d for vm %d", __func__, vcpu_id, vmid); 852 853 if (ioctl(env->vmd_fd, VMM_IOC_RESETCPU, &vrp) == -1) 854 return (errno); 855 856 return (0); 857 } 858 859 /* 860 * create_memory_map 861 * 862 * Sets up the guest physical memory ranges that the VM can access. 863 * 864 * Parameters: 865 * vcp: VM create parameters describing the VM whose memory map 866 * is being created 867 * 868 * Return values: 869 * nothing 870 */ 871 void 872 create_memory_map(struct vm_create_params *vcp) 873 { 874 size_t len, mem_bytes, mem_mb; 875 876 mem_mb = vcp->vcp_memranges[0].vmr_size; 877 vcp->vcp_nmemranges = 0; 878 if (mem_mb < 1 || mem_mb > VMM_MAX_VM_MEM_SIZE) 879 return; 880 881 mem_bytes = mem_mb * 1024 * 1024; 882 883 /* First memory region: 0 - LOWMEM_KB (DOS low mem) */ 884 len = LOWMEM_KB * 1024; 885 vcp->vcp_memranges[0].vmr_gpa = 0x0; 886 vcp->vcp_memranges[0].vmr_size = len; 887 mem_bytes -= len; 888 889 /* 890 * Second memory region: LOWMEM_KB - 1MB. 891 * 892 * N.B. - Normally ROMs or parts of video RAM are mapped here. 893 * We have to add this region, because some systems 894 * unconditionally write to 0xb8000 (VGA RAM), and 895 * we need to make sure that vmm(4) permits accesses 896 * to it. So allocate guest memory for it. 897 */ 898 len = 0x100000 - LOWMEM_KB * 1024; 899 vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024; 900 vcp->vcp_memranges[1].vmr_size = len; 901 mem_bytes -= len; 902 903 /* Make sure that we do not place physical memory into MMIO ranges. */ 904 if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - 0x100000) 905 len = VMM_PCI_MMIO_BAR_BASE - 0x100000; 906 else 907 len = mem_bytes; 908 909 /* Third memory region: 1MB - (1MB + len) */ 910 vcp->vcp_memranges[2].vmr_gpa = 0x100000; 911 vcp->vcp_memranges[2].vmr_size = len; 912 mem_bytes -= len; 913 914 if (mem_bytes > 0) { 915 /* Fourth memory region for the remaining memory (if any) */ 916 vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1; 917 vcp->vcp_memranges[3].vmr_size = mem_bytes; 918 vcp->vcp_nmemranges = 4; 919 } else 920 vcp->vcp_nmemranges = 3; 921 } 922 923 /* 924 * alloc_guest_mem 925 * 926 * Allocates memory for the guest. 
927 * Instead of doing a single allocation with one mmap(), we allocate memory 928 * separately for every range for the following reasons: 929 * - ASLR for the individual ranges 930 * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to 931 * map the single mmap'd userspace memory to the individual guest physical 932 * memory ranges, the underlying amap of the single mmap'd range would have 933 * to allocate per-page reference counters. The reason is that the 934 * individual guest physical ranges would reference the single mmap'd region 935 * only partially. However, if every guest physical range has its own 936 * corresponding mmap'd userspace allocation, there are no partial 937 * references: every guest physical range fully references an mmap'd 938 * range => no per-page reference counters have to be allocated. 939 * 940 * Return values: 941 * 0: success 942 * !0: failure - errno indicating the source of the failure 943 */ 944 int 945 alloc_guest_mem(struct vm_create_params *vcp) 946 { 947 void *p; 948 int ret; 949 size_t i, j; 950 struct vm_mem_range *vmr; 951 952 for (i = 0; i < vcp->vcp_nmemranges; i++) { 953 vmr = &vcp->vcp_memranges[i]; 954 p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE, 955 MAP_PRIVATE | MAP_ANON, -1, 0); 956 if (p == MAP_FAILED) { 957 ret = errno; 958 for (j = 0; j < i; j++) { 959 vmr = &vcp->vcp_memranges[j]; 960 munmap((void *)vmr->vmr_va, vmr->vmr_size); 961 } 962 963 return (ret); 964 } 965 966 vmr->vmr_va = (vaddr_t)p; 967 } 968 969 return (0); 970 } 971 972 /* 973 * vmm_create_vm 974 * 975 * Requests vmm(4) to create a new VM using the supplied creation 976 * parameters. This operation results in the creation of the in-kernel 977 * structures for the VM, but does not start the VM's vcpu(s). 978 * 979 * Parameters: 980 * vcp: vm_create_params struct containing the VM's desired creation 981 * configuration 982 * 983 * Return values: 984 * 0: success 985 * !0 : ioctl to vmm(4) failed 986 */ 987 int 988 vmm_create_vm(struct vm_create_params *vcp) 989 { 990 /* Sanity check arguments */ 991 if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM) 992 return (EINVAL); 993 994 if (vcp->vcp_nmemranges == 0 || 995 vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES) 996 return (EINVAL); 997 998 if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM) 999 return (EINVAL); 1000 1001 if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM) 1002 return (EINVAL); 1003 1004 if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) == -1) 1005 return (errno); 1006 1007 return (0); 1008 } 1009 1010 /* 1011 * init_emulated_hw 1012 * 1013 * Initializes the userspace hardware emulation 1014 */ 1015 void 1016 init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom, 1017 int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps) 1018 { 1019 struct vm_create_params *vcp = &vmc->vmc_params; 1020 int i; 1021 uint64_t memlo, memhi; 1022 1023 /* Calculate memory size for NVRAM registers */ 1024 memlo = memhi = 0; 1025 if (vcp->vcp_nmemranges > 2) 1026 memlo = vcp->vcp_memranges[2].vmr_size - 15 * 0x100000; 1027 1028 if (vcp->vcp_nmemranges > 3) 1029 memhi = vcp->vcp_memranges[3].vmr_size; 1030 1031 /* Reset the IO port map */ 1032 memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS); 1033 1034 /* Init i8253 PIT */ 1035 i8253_init(vcp->vcp_id); 1036 ioports_map[TIMER_CTRL] = vcpu_exit_i8253; 1037 ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253; 1038 ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253; 1039 ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253; 1040 ioports_map[PCKBC_AUX] = 
vcpu_exit_i8253_misc; 1041 1042 /* Init mc146818 RTC */ 1043 mc146818_init(vcp->vcp_id, memlo, memhi); 1044 ioports_map[IO_RTC] = vcpu_exit_mc146818; 1045 ioports_map[IO_RTC + 1] = vcpu_exit_mc146818; 1046 1047 /* Init master and slave PICs */ 1048 i8259_init(); 1049 ioports_map[IO_ICU1] = vcpu_exit_i8259; 1050 ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259; 1051 ioports_map[IO_ICU2] = vcpu_exit_i8259; 1052 ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259; 1053 ioports_map[ELCR0] = vcpu_exit_elcr; 1054 ioports_map[ELCR1] = vcpu_exit_elcr; 1055 1056 /* Init ns8250 UART */ 1057 ns8250_init(con_fd, vcp->vcp_id); 1058 for (i = COM1_DATA; i <= COM1_SCR; i++) 1059 ioports_map[i] = vcpu_exit_com; 1060 1061 /* Init QEMU fw_cfg interface */ 1062 fw_cfg_init(vmc); 1063 ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg; 1064 ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg; 1065 ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma; 1066 ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma; 1067 1068 /* Initialize PCI */ 1069 for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++) 1070 ioports_map[i] = vcpu_exit_pci; 1071 1072 ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci; 1073 ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci; 1074 ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci; 1075 ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci; 1076 ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci; 1077 pci_init(); 1078 1079 /* Initialize virtio devices */ 1080 virtio_init(current_vm, child_cdrom, child_disks, child_taps); 1081 } 1082 /* 1083 * restore_emulated_hw 1084 * 1085 * Restores the userspace hardware emulation from fd 1086 */ 1087 void 1088 restore_emulated_hw(struct vm_create_params *vcp, int fd, 1089 int *child_taps, int child_disks[][VM_MAX_BASE_PER_DISK], int child_cdrom) 1090 { 1091 /* struct vm_create_params *vcp = &vmc->vmc_params; */ 1092 int i; 1093 memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS); 1094 1095 /* Init i8253 PIT */ 1096 i8253_restore(fd, vcp->vcp_id); 1097 ioports_map[TIMER_CTRL] = vcpu_exit_i8253; 1098 ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253; 1099 ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253; 1100 ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253; 1101 1102 /* Init master and slave PICs */ 1103 i8259_restore(fd); 1104 ioports_map[IO_ICU1] = vcpu_exit_i8259; 1105 ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259; 1106 ioports_map[IO_ICU2] = vcpu_exit_i8259; 1107 ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259; 1108 1109 /* Init ns8250 UART */ 1110 ns8250_restore(fd, con_fd, vcp->vcp_id); 1111 for (i = COM1_DATA; i <= COM1_SCR; i++) 1112 ioports_map[i] = vcpu_exit_com; 1113 1114 /* Init mc146818 RTC */ 1115 mc146818_restore(fd, vcp->vcp_id); 1116 ioports_map[IO_RTC] = vcpu_exit_mc146818; 1117 ioports_map[IO_RTC + 1] = vcpu_exit_mc146818; 1118 1119 /* Init QEMU fw_cfg interface */ 1120 fw_cfg_restore(fd); 1121 ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg; 1122 ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg; 1123 ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma; 1124 ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma; 1125 1126 /* Initialize PCI */ 1127 for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++) 1128 ioports_map[i] = vcpu_exit_pci; 1129 1130 ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci; 1131 ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci; 1132 ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci; 1133 ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci; 1134 ioports_map[PCI_MODE1_DATA_REG + 3] = 
vcpu_exit_pci; 1135 pci_restore(fd); 1136 virtio_restore(fd, current_vm, child_cdrom, child_disks, child_taps); 1137 } 1138 1139 /* 1140 * run_vm 1141 * 1142 * Runs the VM whose creation parameters are specified in vcp 1143 * 1144 * Parameters: 1145 * child_cdrom: previously-opened child ISO disk file descriptor 1146 * child_disks: previously-opened child VM disk file file descriptors 1147 * child_taps: previously-opened child tap file descriptors 1148 * vmc: vmop_create_params struct containing the VM's desired creation 1149 * configuration 1150 * vrs: VCPU register state to initialize 1151 * 1152 * Return values: 1153 * 0: the VM exited normally 1154 * !0 : the VM exited abnormally or failed to start 1155 */ 1156 int 1157 run_vm(int child_cdrom, int child_disks[][VM_MAX_BASE_PER_DISK], 1158 int *child_taps, struct vmop_create_params *vmc, 1159 struct vcpu_reg_state *vrs) 1160 { 1161 struct vm_create_params *vcp = &vmc->vmc_params; 1162 struct vm_rwregs_params vregsp; 1163 uint8_t evdone = 0; 1164 size_t i; 1165 int ret; 1166 pthread_t *tid, evtid; 1167 struct vm_run_params **vrp; 1168 void *exit_status; 1169 1170 if (vcp == NULL) 1171 return (EINVAL); 1172 1173 if (child_cdrom == -1 && strlen(vcp->vcp_cdrom)) 1174 return (EINVAL); 1175 1176 if (child_disks == NULL && vcp->vcp_ndisks != 0) 1177 return (EINVAL); 1178 1179 if (child_taps == NULL && vcp->vcp_nnics != 0) 1180 return (EINVAL); 1181 1182 if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM) 1183 return (EINVAL); 1184 1185 if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM) 1186 return (EINVAL); 1187 1188 if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM) 1189 return (EINVAL); 1190 1191 if (vcp->vcp_nmemranges == 0 || 1192 vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES) 1193 return (EINVAL); 1194 1195 tid = calloc(vcp->vcp_ncpus, sizeof(pthread_t)); 1196 vrp = calloc(vcp->vcp_ncpus, sizeof(struct vm_run_params *)); 1197 if (tid == NULL || vrp == NULL) { 1198 log_warn("%s: memory allocation error - exiting.", 1199 __progname); 1200 return (ENOMEM); 1201 } 1202 1203 log_debug("%s: initializing hardware for vm %s", __func__, 1204 vcp->vcp_name); 1205 1206 if (!(current_vm->vm_state & VM_STATE_RECEIVED)) 1207 init_emulated_hw(vmc, child_cdrom, child_disks, child_taps); 1208 1209 ret = pthread_mutex_init(&threadmutex, NULL); 1210 if (ret) { 1211 log_warn("%s: could not initialize thread state mutex", 1212 __func__); 1213 return (ret); 1214 } 1215 ret = pthread_cond_init(&threadcond, NULL); 1216 if (ret) { 1217 log_warn("%s: could not initialize thread state " 1218 "condition variable", __func__); 1219 return (ret); 1220 } 1221 1222 mutex_lock(&threadmutex); 1223 1224 log_debug("%s: starting vcpu threads for vm %s", __func__, 1225 vcp->vcp_name); 1226 1227 /* 1228 * Create and launch one thread for each VCPU. These threads may 1229 * migrate between PCPUs over time; the need to reload CPU state 1230 * in such situations is detected and performed by vmm(4) in the 1231 * kernel. 
1232 */ 1233 for (i = 0 ; i < vcp->vcp_ncpus; i++) { 1234 vrp[i] = malloc(sizeof(struct vm_run_params)); 1235 if (vrp[i] == NULL) { 1236 log_warn("%s: memory allocation error - " 1237 "exiting.", __progname); 1238 /* caller will exit, so skip freeing */ 1239 return (ENOMEM); 1240 } 1241 vrp[i]->vrp_exit = malloc(sizeof(struct vm_exit)); 1242 if (vrp[i]->vrp_exit == NULL) { 1243 log_warn("%s: memory allocation error - " 1244 "exiting.", __progname); 1245 /* caller will exit, so skip freeing */ 1246 return (ENOMEM); 1247 } 1248 vrp[i]->vrp_vm_id = vcp->vcp_id; 1249 vrp[i]->vrp_vcpu_id = i; 1250 1251 if (vcpu_reset(vcp->vcp_id, i, vrs)) { 1252 log_warnx("%s: cannot reset VCPU %zu - exiting.", 1253 __progname, i); 1254 return (EIO); 1255 } 1256 1257 /* once more because reset_cpu changes regs */ 1258 if (current_vm->vm_state & VM_STATE_RECEIVED) { 1259 vregsp.vrwp_vm_id = vcp->vcp_id; 1260 vregsp.vrwp_vcpu_id = i; 1261 vregsp.vrwp_regs = *vrs; 1262 vregsp.vrwp_mask = VM_RWREGS_ALL; 1263 if ((ret = ioctl(env->vmd_fd, VMM_IOC_WRITEREGS, 1264 &vregsp)) == -1) { 1265 log_warn("%s: writeregs failed", __func__); 1266 return (ret); 1267 } 1268 } 1269 1270 ret = pthread_cond_init(&vcpu_run_cond[i], NULL); 1271 if (ret) { 1272 log_warnx("%s: cannot initialize cond var (%d)", 1273 __progname, ret); 1274 return (ret); 1275 } 1276 1277 ret = pthread_mutex_init(&vcpu_run_mtx[i], NULL); 1278 if (ret) { 1279 log_warnx("%s: cannot initialize mtx (%d)", 1280 __progname, ret); 1281 return (ret); 1282 } 1283 1284 ret = pthread_cond_init(&vcpu_unpause_cond[i], NULL); 1285 if (ret) { 1286 log_warnx("%s: cannot initialize unpause var (%d)", 1287 __progname, ret); 1288 return (ret); 1289 } 1290 1291 ret = pthread_mutex_init(&vcpu_unpause_mtx[i], NULL); 1292 if (ret) { 1293 log_warnx("%s: cannot initialize unpause mtx (%d)", 1294 __progname, ret); 1295 return (ret); 1296 } 1297 1298 vcpu_hlt[i] = 0; 1299 1300 /* Start each VCPU run thread at vcpu_run_loop */ 1301 ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]); 1302 if (ret) { 1303 /* caller will _exit after this return */ 1304 ret = errno; 1305 log_warn("%s: could not create vcpu thread %zu", 1306 __func__, i); 1307 return (ret); 1308 } 1309 } 1310 1311 log_debug("%s: waiting on events for VM %s", __func__, vcp->vcp_name); 1312 ret = pthread_create(&evtid, NULL, event_thread, &evdone); 1313 if (ret) { 1314 errno = ret; 1315 log_warn("%s: could not create event thread", __func__); 1316 return (ret); 1317 } 1318 1319 for (;;) { 1320 ret = pthread_cond_wait(&threadcond, &threadmutex); 1321 if (ret) { 1322 log_warn("%s: waiting on thread state condition " 1323 "variable failed", __func__); 1324 return (ret); 1325 } 1326 1327 /* 1328 * Did a VCPU thread exit with an error? => return the first one 1329 */ 1330 for (i = 0; i < vcp->vcp_ncpus; i++) { 1331 if (vcpu_done[i] == 0) 1332 continue; 1333 1334 if (pthread_join(tid[i], &exit_status)) { 1335 log_warn("%s: failed to join thread %zd - " 1336 "exiting", __progname, i); 1337 return (EIO); 1338 } 1339 1340 ret = (intptr_t)exit_status; 1341 } 1342 1343 /* Did the event thread exit? => return with an error */ 1344 if (evdone) { 1345 if (pthread_join(evtid, &exit_status)) { 1346 log_warn("%s: failed to join event thread - " 1347 "exiting", __progname); 1348 return (EIO); 1349 } 1350 1351 log_warnx("%s: vm %d event thread exited " 1352 "unexpectedly", __progname, vcp->vcp_id); 1353 return (EIO); 1354 } 1355 1356 /* Did all VCPU threads exit successfully? 
=> return */ 1357 for (i = 0; i < vcp->vcp_ncpus; i++) { 1358 if (vcpu_done[i] == 0) 1359 break; 1360 } 1361 if (i == vcp->vcp_ncpus) 1362 return (ret); 1363 1364 /* Some more threads to wait for, start over */ 1365 } 1366 1367 return (ret); 1368 } 1369 1370 void * 1371 event_thread(void *arg) 1372 { 1373 uint8_t *donep = arg; 1374 intptr_t ret; 1375 1376 ret = event_dispatch(); 1377 1378 mutex_lock(&threadmutex); 1379 *donep = 1; 1380 pthread_cond_signal(&threadcond); 1381 mutex_unlock(&threadmutex); 1382 1383 return (void *)ret; 1384 } 1385 1386 /* 1387 * vcpu_run_loop 1388 * 1389 * Runs a single VCPU until vmm(4) requires help handling an exit, 1390 * or the VM terminates. 1391 * 1392 * Parameters: 1393 * arg: vcpu_run_params for the VCPU being run by this thread 1394 * 1395 * Return values: 1396 * NULL: the VCPU shutdown properly 1397 * !NULL: error processing VCPU run, or the VCPU shutdown abnormally 1398 */ 1399 void * 1400 vcpu_run_loop(void *arg) 1401 { 1402 struct vm_run_params *vrp = (struct vm_run_params *)arg; 1403 intptr_t ret = 0; 1404 int irq; 1405 uint32_t n; 1406 1407 vrp->vrp_continue = 0; 1408 n = vrp->vrp_vcpu_id; 1409 1410 for (;;) { 1411 ret = pthread_mutex_lock(&vcpu_run_mtx[n]); 1412 1413 if (ret) { 1414 log_warnx("%s: can't lock vcpu run mtx (%d)", 1415 __func__, (int)ret); 1416 return ((void *)ret); 1417 } 1418 1419 /* If we are halted and need to pause, pause */ 1420 if (vcpu_hlt[n] && (current_vm->vm_state & VM_STATE_PAUSED)) { 1421 ret = pthread_barrier_wait(&vm_pause_barrier); 1422 if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) { 1423 log_warnx("%s: could not wait on pause barrier (%d)", 1424 __func__, (int)ret); 1425 return ((void *)ret); 1426 } 1427 1428 ret = pthread_mutex_lock(&vcpu_unpause_mtx[n]); 1429 if (ret) { 1430 log_warnx("%s: can't lock vcpu unpause mtx (%d)", 1431 __func__, (int)ret); 1432 return ((void *)ret); 1433 } 1434 1435 ret = pthread_cond_wait(&vcpu_unpause_cond[n], 1436 &vcpu_unpause_mtx[n]); 1437 if (ret) { 1438 log_warnx( 1439 "%s: can't wait on unpause cond (%d)", 1440 __func__, (int)ret); 1441 break; 1442 } 1443 ret = pthread_mutex_unlock(&vcpu_unpause_mtx[n]); 1444 if (ret) { 1445 log_warnx("%s: can't unlock unpause mtx (%d)", 1446 __func__, (int)ret); 1447 break; 1448 } 1449 } 1450 1451 /* If we are halted and not paused, wait */ 1452 if (vcpu_hlt[n]) { 1453 ret = pthread_cond_wait(&vcpu_run_cond[n], 1454 &vcpu_run_mtx[n]); 1455 1456 if (ret) { 1457 log_warnx( 1458 "%s: can't wait on cond (%d)", 1459 __func__, (int)ret); 1460 (void)pthread_mutex_unlock( 1461 &vcpu_run_mtx[n]); 1462 break; 1463 } 1464 } 1465 1466 ret = pthread_mutex_unlock(&vcpu_run_mtx[n]); 1467 1468 if (ret) { 1469 log_warnx("%s: can't unlock mutex on cond (%d)", 1470 __func__, (int)ret); 1471 break; 1472 } 1473 1474 if (vrp->vrp_irqready && i8259_is_pending()) { 1475 irq = i8259_ack(); 1476 vrp->vrp_irq = irq; 1477 } else 1478 vrp->vrp_irq = 0xFFFF; 1479 1480 /* Still more pending? 
 */
		if (i8259_is_pending()) {
			/*
			 * XXX can probably avoid ioctls here by providing intr
			 * in vrp
			 */
			if (vcpu_pic_intr(vrp->vrp_vm_id,
			    vrp->vrp_vcpu_id, 1)) {
				fatal("can't set INTR");
			}
		} else {
			if (vcpu_pic_intr(vrp->vrp_vm_id,
			    vrp->vrp_vcpu_id, 0)) {
				fatal("can't clear INTR");
			}
		}

		if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) == -1) {
			/* If run ioctl failed, exit */
			ret = errno;
			log_warn("%s: vm %d / vcpu %d run ioctl failed",
			    __func__, vrp->vrp_vm_id, n);
			break;
		}

		/* If the VM is terminating, exit normally */
		if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED) {
			ret = (intptr_t)NULL;
			break;
		}

		if (vrp->vrp_exit_reason != VM_EXIT_NONE) {
			/*
			 * vmm(4) needs help handling an exit, handle in
			 * vcpu_exit.
			 */
			ret = vcpu_exit(vrp);
			if (ret)
				break;
		}
	}

	mutex_lock(&threadmutex);
	vcpu_done[n] = 1;
	pthread_cond_signal(&threadcond);
	mutex_unlock(&threadmutex);

	return ((void *)ret);
}

int
vcpu_pic_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr)
{
	struct vm_intr_params vip;

	memset(&vip, 0, sizeof(vip));

	vip.vip_vm_id = vm_id;
	vip.vip_vcpu_id = vcpu_id; /* XXX always 0? */
	vip.vip_intr = intr;

	if (ioctl(env->vmd_fd, VMM_IOC_INTR, &vip) == -1)
		return (errno);

	return (0);
}

/*
 * vcpu_exit_pci
 *
 * Handle all I/O to the emulated PCI subsystem.
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 *
 * Return value:
 *  Interrupt to inject to the guest VM, or 0xFF if no interrupt should
 *  be injected.
 */
uint8_t
vcpu_exit_pci(struct vm_run_params *vrp)
{
	struct vm_exit *vei = vrp->vrp_exit;
	uint8_t intr;

	intr = 0xFF;

	switch (vei->vei.vei_port) {
	case PCI_MODE1_ADDRESS_REG:
		pci_handle_address_reg(vrp);
		break;
	case PCI_MODE1_DATA_REG:
	case PCI_MODE1_DATA_REG + 1:
	case PCI_MODE1_DATA_REG + 2:
	case PCI_MODE1_DATA_REG + 3:
		pci_handle_data_reg(vrp);
		break;
	case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END:
		intr = pci_handle_io(vrp);
		break;
	default:
		log_warnx("%s: unknown PCI register 0x%llx",
		    __progname, (uint64_t)vei->vei.vei_port);
		break;
	}

	return (intr);
}

/*
 * vcpu_exit_inout
 *
 * Handle all I/O exits that need to be emulated in vmd. This includes the
 * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device.
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 */
void
vcpu_exit_inout(struct vm_run_params *vrp)
{
	struct vm_exit *vei = vrp->vrp_exit;
	uint8_t intr = 0xFF;

	if (ioports_map[vei->vei.vei_port] != NULL)
		intr = ioports_map[vei->vei.vei_port](vrp);
	else if (vei->vei.vei_dir == VEI_DIR_IN)
		set_return_data(vei, 0xFFFFFFFF);

	if (intr != 0xFF)
		vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr);
}

/*
 * vcpu_exit_eptviolation
 *
 * Handle an EPT violation.
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 *
 * Return values:
 *  0: no action required
 *  EAGAIN: a protection fault occurred, kill the vm.
1624 */ 1625 int 1626 vcpu_exit_eptviolation(struct vm_run_params *vrp) 1627 { 1628 struct vm_exit *ve = vrp->vrp_exit; 1629 1630 /* 1631 * vmd may be exiting to vmd to handle a pending interrupt 1632 * but last exit type may have been VMX_EXIT_EPT_VIOLATION, 1633 * check the fault_type to ensure we really are processing 1634 * a VMX_EXIT_EPT_VIOLATION. 1635 */ 1636 if (ve->vee.vee_fault_type == VEE_FAULT_PROTECT) { 1637 log_debug("%s: EPT Violation: rip=0x%llx", 1638 __progname, vrp->vrp_exit->vrs.vrs_gprs[VCPU_REGS_RIP]); 1639 return (EAGAIN); 1640 } 1641 1642 return (0); 1643 } 1644 1645 /* 1646 * vcpu_exit 1647 * 1648 * Handle a vcpu exit. This function is called when it is determined that 1649 * vmm(4) requires the assistance of vmd to support a particular guest 1650 * exit type (eg, accessing an I/O port or device). Guest state is contained 1651 * in 'vrp', and will be resent to vmm(4) on exit completion. 1652 * 1653 * Upon conclusion of handling the exit, the function determines if any 1654 * interrupts should be injected into the guest, and asserts the proper 1655 * IRQ line whose interrupt should be vectored. 1656 * 1657 * Parameters: 1658 * vrp: vcpu run parameters containing guest state for this exit 1659 * 1660 * Return values: 1661 * 0: the exit was handled successfully 1662 * 1: an error occurred (eg, unknown exit reason passed in 'vrp') 1663 */ 1664 int 1665 vcpu_exit(struct vm_run_params *vrp) 1666 { 1667 int ret; 1668 1669 switch (vrp->vrp_exit_reason) { 1670 case VMX_EXIT_INT_WINDOW: 1671 case SVM_VMEXIT_VINTR: 1672 case VMX_EXIT_CPUID: 1673 case VMX_EXIT_EXTINT: 1674 case SVM_VMEXIT_INTR: 1675 case SVM_VMEXIT_NPF: 1676 case SVM_VMEXIT_MSR: 1677 case SVM_VMEXIT_CPUID: 1678 /* 1679 * We may be exiting to vmd to handle a pending interrupt but 1680 * at the same time the last exit type may have been one of 1681 * these. In this case, there's nothing extra to be done 1682 * here (and falling through to the default case below results 1683 * in more vmd log spam). 1684 */ 1685 break; 1686 case VMX_EXIT_EPT_VIOLATION: 1687 ret = vcpu_exit_eptviolation(vrp); 1688 if (ret) 1689 return (ret); 1690 1691 break; 1692 case VMX_EXIT_IO: 1693 case SVM_VMEXIT_IOIO: 1694 vcpu_exit_inout(vrp); 1695 break; 1696 case VMX_EXIT_HLT: 1697 case SVM_VMEXIT_HLT: 1698 ret = pthread_mutex_lock(&vcpu_run_mtx[vrp->vrp_vcpu_id]); 1699 if (ret) { 1700 log_warnx("%s: can't lock vcpu mutex (%d)", 1701 __func__, ret); 1702 return (ret); 1703 } 1704 vcpu_hlt[vrp->vrp_vcpu_id] = 1; 1705 ret = pthread_mutex_unlock(&vcpu_run_mtx[vrp->vrp_vcpu_id]); 1706 if (ret) { 1707 log_warnx("%s: can't unlock vcpu mutex (%d)", 1708 __func__, ret); 1709 return (ret); 1710 } 1711 break; 1712 case VMX_EXIT_TRIPLE_FAULT: 1713 case SVM_VMEXIT_SHUTDOWN: 1714 /* reset VM */ 1715 return (EAGAIN); 1716 default: 1717 log_debug("%s: unknown exit reason 0x%x", 1718 __progname, vrp->vrp_exit_reason); 1719 } 1720 1721 vrp->vrp_continue = 1; 1722 1723 return (0); 1724 } 1725 1726 /* 1727 * find_gpa_range 1728 * 1729 * Search for a contiguous guest physical mem range. 1730 * 1731 * Parameters: 1732 * vcp: VM create parameters that contain the memory map to search in 1733 * gpa: the starting guest physical address 1734 * len: the length of the memory range 1735 * 1736 * Return values: 1737 * NULL: on failure if there is no memory range as described by the parameters 1738 * Pointer to vm_mem_range that contains the start of the range otherwise. 
 */
static struct vm_mem_range *
find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len)
{
	size_t i, n;
	struct vm_mem_range *vmr;

	/* Find the first vm_mem_range that contains gpa */
	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		if (vmr->vmr_gpa + vmr->vmr_size >= gpa)
			break;
	}

	/* No range found. */
	if (i == vcp->vcp_nmemranges)
		return (NULL);

	/*
	 * vmr may cover the range [gpa, gpa + len) only partly. Make
	 * sure that the following vm_mem_ranges are contiguous and
	 * cover the rest.
	 */
	n = vmr->vmr_size - (gpa - vmr->vmr_gpa);
	if (len < n)
		len = 0;
	else
		len -= n;
	gpa = vmr->vmr_gpa + vmr->vmr_size;
	for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		if (gpa != vmr->vmr_gpa)
			return (NULL);
		if (len <= vmr->vmr_size)
			len = 0;
		else
			len -= vmr->vmr_size;

		gpa = vmr->vmr_gpa + vmr->vmr_size;
	}

	if (len != 0)
		return (NULL);

	return (vmr);
}

/*
 * write_mem
 *
 * Copies data from 'buf' into the guest VM's memory at paddr 'dst'.
 *
 * Parameters:
 *  dst: the destination paddr_t in the guest VM
 *  buf: data to copy (or NULL to zero the data)
 *  len: number of bytes to copy
 *
 * Return values:
 *  0: success
 *  EINVAL: if the guest physical memory range [dst, dst + len) does not
 *      exist in the guest.
 */
int
write_mem(paddr_t dst, const void *buf, size_t len)
{
	const char *from = buf;
	char *to;
	size_t n, off;
	struct vm_mem_range *vmr;

	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, dst, len);
	if (vmr == NULL) {
		errno = EINVAL;
		log_warn("%s: failed - invalid memory range dst = 0x%lx, "
		    "len = 0x%zx", __func__, dst, len);
		return (EINVAL);
	}

	off = dst - vmr->vmr_gpa;
	while (len != 0) {
		n = vmr->vmr_size - off;
		if (len < n)
			n = len;

		to = (char *)vmr->vmr_va + off;
		if (buf == NULL)
			memset(to, 0, n);
		else {
			memcpy(to, from, n);
			from += n;
		}
		len -= n;
		off = 0;
		vmr++;
	}

	return (0);
}

/*
 * read_mem
 *
 * Reads memory at guest paddr 'src' into 'buf'.
 *
 * Parameters:
 *  src: the source paddr_t in the guest VM to read from.
 *  buf: destination (local) buffer
 *  len: number of bytes to read
 *
 * Return values:
 *  0: success
 *  EINVAL: if the guest physical memory range [src, src + len) does not
 *      exist in the guest.
 */
int
read_mem(paddr_t src, void *buf, size_t len)
{
	char *from, *to = buf;
	size_t n, off;
	struct vm_mem_range *vmr;

	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
	if (vmr == NULL) {
		errno = EINVAL;
		log_warn("%s: failed - invalid memory range src = 0x%lx, "
		    "len = 0x%zx", __func__, src, len);
		return (EINVAL);
	}

	off = src - vmr->vmr_gpa;
	while (len != 0) {
		n = vmr->vmr_size - off;
		if (len < n)
			n = len;

		from = (char *)vmr->vmr_va + off;
		memcpy(to, from, n);

		to += n;
		len -= n;
		off = 0;
		vmr++;
	}

	return (0);
}

/*
 * vcpu_assert_pic_irq
 *
 * Injects the specified IRQ on the supplied vcpu/vm
 *
 * Parameters:
 *  vm_id: VM ID to inject to
 *  vcpu_id: VCPU ID to inject to
 *  irq: IRQ to inject
 */
void
vcpu_assert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
{
	int ret;

	i8259_assert_irq(irq);

	if (i8259_is_pending()) {
		if (vcpu_pic_intr(vm_id, vcpu_id, 1))
			fatalx("%s: can't assert INTR", __func__);

		ret = pthread_mutex_lock(&vcpu_run_mtx[vcpu_id]);
		if (ret)
			fatalx("%s: can't lock vcpu mtx (%d)", __func__, ret);

		vcpu_hlt[vcpu_id] = 0;
		ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]);
		if (ret)
			fatalx("%s: can't signal (%d)", __func__, ret);
		ret = pthread_mutex_unlock(&vcpu_run_mtx[vcpu_id]);
		if (ret)
			fatalx("%s: can't unlock vcpu mtx (%d)", __func__, ret);
	}
}

/*
 * vcpu_deassert_pic_irq
 *
 * Clears the specified IRQ on the supplied vcpu/vm
 *
 * Parameters:
 *  vm_id: VM ID to clear in
 *  vcpu_id: VCPU ID to clear in
 *  irq: IRQ to clear
 */
void
vcpu_deassert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
{
	i8259_deassert_irq(irq);

	if (!i8259_is_pending()) {
		if (vcpu_pic_intr(vm_id, vcpu_id, 0))
			fatalx("%s: can't deassert INTR for vm_id %d, "
			    "vcpu_id %d", __func__, vm_id, vcpu_id);
	}
}

/*
 * fd_hasdata
 *
 * Determines if data can be read from a file descriptor.
 *
 * Parameters:
 *  fd: the fd to check
 *
 * Return values:
 *  1 if data can be read from an fd, or 0 otherwise.
1953 */ 1954 int 1955 fd_hasdata(int fd) 1956 { 1957 struct pollfd pfd[1]; 1958 int nready, hasdata = 0; 1959 1960 pfd[0].fd = fd; 1961 pfd[0].events = POLLIN; 1962 nready = poll(pfd, 1, 0); 1963 if (nready == -1) 1964 log_warn("checking file descriptor for data failed"); 1965 else if (nready == 1 && pfd[0].revents & POLLIN) 1966 hasdata = 1; 1967 return (hasdata); 1968 } 1969 1970 /* 1971 * mutex_lock 1972 * 1973 * Wrapper function for pthread_mutex_lock that does error checking and that 1974 * exits on failure 1975 */ 1976 void 1977 mutex_lock(pthread_mutex_t *m) 1978 { 1979 int ret; 1980 1981 ret = pthread_mutex_lock(m); 1982 if (ret) { 1983 errno = ret; 1984 fatal("could not acquire mutex"); 1985 } 1986 } 1987 1988 /* 1989 * mutex_unlock 1990 * 1991 * Wrapper function for pthread_mutex_unlock that does error checking and that 1992 * exits on failure 1993 */ 1994 void 1995 mutex_unlock(pthread_mutex_t *m) 1996 { 1997 int ret; 1998 1999 ret = pthread_mutex_unlock(m); 2000 if (ret) { 2001 errno = ret; 2002 fatal("could not release mutex"); 2003 } 2004 } 2005 2006 /* 2007 * set_return_data 2008 * 2009 * Utility function for manipulating register data in vm exit info structs. This 2010 * function ensures that the data is copied to the vei->vei.vei_data field with 2011 * the proper size for the operation being performed. 2012 * 2013 * Parameters: 2014 * vei: exit information 2015 * data: return data 2016 */ 2017 void 2018 set_return_data(struct vm_exit *vei, uint32_t data) 2019 { 2020 switch (vei->vei.vei_size) { 2021 case 1: 2022 vei->vei.vei_data &= ~0xFF; 2023 vei->vei.vei_data |= (uint8_t)data; 2024 break; 2025 case 2: 2026 vei->vei.vei_data &= ~0xFFFF; 2027 vei->vei.vei_data |= (uint16_t)data; 2028 break; 2029 case 4: 2030 vei->vei.vei_data = data; 2031 break; 2032 } 2033 } 2034 2035 /* 2036 * get_input_data 2037 * 2038 * Utility function for manipulating register data in vm exit info 2039 * structs. This function ensures that the data is copied from the 2040 * vei->vei.vei_data field with the proper size for the operation being 2041 * performed. 2042 * 2043 * Parameters: 2044 * vei: exit information 2045 * data: location to store the result 2046 */ 2047 void 2048 get_input_data(struct vm_exit *vei, uint32_t *data) 2049 { 2050 switch (vei->vei.vei_size) { 2051 case 1: 2052 *data &= 0xFFFFFF00; 2053 *data |= (uint8_t)vei->vei.vei_data; 2054 break; 2055 case 2: 2056 *data &= 0xFFFF0000; 2057 *data |= (uint16_t)vei->vei.vei_data; 2058 break; 2059 case 4: 2060 *data = vei->vei.vei_data; 2061 break; 2062 default: 2063 log_warnx("%s: invalid i/o size %d", __func__, 2064 vei->vei.vei_size); 2065 } 2066 2067 } 2068 2069 /* 2070 * translate_gva 2071 * 2072 * Translates a guest virtual address to a guest physical address by walking 2073 * the currently active page table (if needed). 2074 * 2075 * Note - this function can possibly alter the supplied VCPU state. 2076 * Specifically, it may inject exceptions depending on the current VCPU 2077 * configuration, and may alter %cr2 on #PF. Consequently, this function 2078 * should only be used as part of instruction emulation. 2079 * 2080 * Parameters: 2081 * exit: The VCPU this translation should be performed for (guest MMU settings 2082 * are gathered from this VCPU) 2083 * va: virtual address to translate 2084 * pa: pointer to paddr_t variable that will receive the translated physical 2085 * address. 'pa' is unchanged on error. 
2086 * mode: one of PROT_READ, PROT_WRITE, PROT_EXEC indicating the mode in which 2087 * the address should be translated 2088 * 2089 * Return values: 2090 * 0: the address was successfully translated - 'pa' contains the physical 2091 * address currently mapped by 'va'. 2092 * EFAULT: the PTE for 'VA' is unmapped. A #PF will be injected in this case 2093 * and %cr2 set in the vcpu structure. 2094 * EINVAL: an error occurred reading paging table structures 2095 */ 2096 int 2097 translate_gva(struct vm_exit* exit, uint64_t va, uint64_t* pa, int mode) 2098 { 2099 int level, shift, pdidx; 2100 uint64_t pte, pt_paddr, pte_paddr, mask, low_mask, high_mask; 2101 uint64_t shift_width, pte_size; 2102 struct vcpu_reg_state *vrs; 2103 2104 vrs = &exit->vrs; 2105 2106 if (!pa) 2107 return (EINVAL); 2108 2109 if (!(vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PG)) { 2110 log_debug("%s: unpaged, va=pa=0x%llx", __func__, va); 2111 *pa = va; 2112 return (0); 2113 } 2114 2115 pt_paddr = vrs->vrs_crs[VCPU_REGS_CR3]; 2116 2117 log_debug("%s: guest %%cr0=0x%llx, %%cr3=0x%llx", __func__, 2118 vrs->vrs_crs[VCPU_REGS_CR0], vrs->vrs_crs[VCPU_REGS_CR3]); 2119 2120 if (vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PE) { 2121 if (vrs->vrs_crs[VCPU_REGS_CR4] & CR4_PAE) { 2122 pte_size = sizeof(uint64_t); 2123 shift_width = 9; 2124 2125 if (vrs->vrs_msrs[VCPU_REGS_EFER] & EFER_LMA) { 2126 /* 4 level paging */ 2127 level = 4; 2128 mask = L4_MASK; 2129 shift = L4_SHIFT; 2130 } else { 2131 /* 32 bit with PAE paging */ 2132 level = 3; 2133 mask = L3_MASK; 2134 shift = L3_SHIFT; 2135 } 2136 } else { 2137 /* 32 bit paging */ 2138 level = 2; 2139 shift_width = 10; 2140 mask = 0xFFC00000; 2141 shift = 22; 2142 pte_size = sizeof(uint32_t); 2143 } 2144 } else 2145 return (EINVAL); 2146 2147 /* XXX: Check for R bit in segment selector and set A bit */ 2148 2149 for (;level > 0; level--) { 2150 pdidx = (va & mask) >> shift; 2151 pte_paddr = (pt_paddr) + (pdidx * pte_size); 2152 2153 log_debug("%s: read pte level %d @ GPA 0x%llx", __func__, 2154 level, pte_paddr); 2155 if (read_mem(pte_paddr, &pte, pte_size)) { 2156 log_warn("%s: failed to read pte", __func__); 2157 return (EFAULT); 2158 } 2159 2160 log_debug("%s: PTE @ 0x%llx = 0x%llx", __func__, pte_paddr, 2161 pte); 2162 2163 /* XXX: Set CR2 */ 2164 if (!(pte & PG_V)) 2165 return (EFAULT); 2166 2167 /* XXX: Check for SMAP */ 2168 if ((mode == PROT_WRITE) && !(pte & PG_RW)) 2169 return (EPERM); 2170 2171 if ((exit->cpl > 0) && !(pte & PG_u)) 2172 return (EPERM); 2173 2174 pte = pte | PG_U; 2175 if (mode == PROT_WRITE) 2176 pte = pte | PG_M; 2177 if (write_mem(pte_paddr, &pte, pte_size)) { 2178 log_warn("%s: failed to write back flags to pte", 2179 __func__); 2180 return (EIO); 2181 } 2182 2183 /* XXX: EINVAL if in 32bit and PG_PS is 1 but CR4.PSE is 0 */ 2184 if (pte & PG_PS) 2185 break; 2186 2187 if (level > 1) { 2188 pt_paddr = pte & PG_FRAME; 2189 shift -= shift_width; 2190 mask = mask >> shift_width; 2191 } 2192 } 2193 2194 low_mask = (1 << shift) - 1; 2195 high_mask = (((uint64_t)1ULL << ((pte_size * 8) - 1)) - 1) ^ low_mask; 2196 *pa = (pte & high_mask) | (va & low_mask); 2197 2198 log_debug("%s: final GPA for GVA 0x%llx = 0x%llx\n", __func__, va, *pa); 2199 2200 return (0); 2201 } 2202 2203 /* 2204 * vm_pipe_init 2205 * 2206 * Initialize a vm_dev_pipe, setting up its file descriptors and its 2207 * event structure with the given callback. 
 *
 * Parameters:
 *  p: pointer to vm_dev_pipe struct to initialize
 *  cb: callback to use for READ events on the read end of the pipe
 */
void
vm_pipe_init(struct vm_dev_pipe *p, void (*cb)(int, short, void *))
{
	int ret;
	int fds[2];

	memset(p, 0, sizeof(struct vm_dev_pipe));

	ret = pipe(fds);
	if (ret)
		fatal("failed to create vm_dev_pipe pipe");

	p->read = fds[0];
	p->write = fds[1];

	event_set(&p->read_ev, p->read, EV_READ | EV_PERSIST, cb, NULL);
}

/*
 * vm_pipe_send
 *
 * Send a message to an emulated device via the provided vm_dev_pipe.
 *
 * Parameters:
 *  p: pointer to initialized vm_dev_pipe
 *  msg: message to send in the channel
 */
void
vm_pipe_send(struct vm_dev_pipe *p, enum pipe_msg_type msg)
{
	size_t n;
	n = write(p->write, &msg, sizeof(msg));
	if (n != sizeof(msg))
		fatal("failed to write to device pipe");
}

/*
 * vm_pipe_recv
 *
 * Receive a message for an emulated device via the provided vm_dev_pipe.
 * Returns the message value, otherwise will exit on failure.
 *
 * Parameters:
 *  p: pointer to initialized vm_dev_pipe
 *
 * Return values:
 *  a value of enum pipe_msg_type or fatal exit on read(2) error
 */
enum pipe_msg_type
vm_pipe_recv(struct vm_dev_pipe *p)
{
	size_t n;
	enum pipe_msg_type msg;
	n = read(p->read, &msg, sizeof(msg));
	if (n != sizeof(msg))
		fatal("failed to read from device pipe");

	return msg;
}