1 /* $OpenBSD: vm.c,v 1.62 2021/04/05 18:09:48 dv Exp $ */ 2 3 /* 4 * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 19 #include <sys/types.h> 20 #include <sys/ioctl.h> 21 #include <sys/queue.h> 22 #include <sys/wait.h> 23 #include <sys/uio.h> 24 #include <sys/stat.h> 25 #include <sys/socket.h> 26 #include <sys/time.h> 27 #include <sys/mman.h> 28 29 #include <dev/ic/i8253reg.h> 30 #include <dev/isa/isareg.h> 31 #include <dev/pci/pcireg.h> 32 33 #include <machine/param.h> 34 #include <machine/psl.h> 35 #include <machine/pte.h> 36 #include <machine/specialreg.h> 37 #include <machine/vmmvar.h> 38 39 #include <net/if.h> 40 41 #include <errno.h> 42 #include <event.h> 43 #include <fcntl.h> 44 #include <imsg.h> 45 #include <limits.h> 46 #include <poll.h> 47 #include <pthread.h> 48 #include <stddef.h> 49 #include <stdio.h> 50 #include <stdlib.h> 51 #include <string.h> 52 #include <unistd.h> 53 #include <util.h> 54 55 #include "vmd.h" 56 #include "vmm.h" 57 #include "loadfile.h" 58 #include "pci.h" 59 #include "virtio.h" 60 #include "proc.h" 61 #include "i8253.h" 62 #include "i8259.h" 63 #include "ns8250.h" 64 #include "mc146818.h" 65 #include "fw_cfg.h" 66 #include "atomicio.h" 67 68 io_fn_t ioports_map[MAX_PORTS]; 69 70 int run_vm(int, int[][VM_MAX_BASE_PER_DISK], int *, 71 struct vmop_create_params *, struct vcpu_reg_state *); 72 void vm_dispatch_vmm(int, short, void *); 73 void *event_thread(void *); 74 void *vcpu_run_loop(void *); 75 int vcpu_exit(struct vm_run_params *); 76 int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *); 77 void create_memory_map(struct vm_create_params *); 78 int alloc_guest_mem(struct vm_create_params *); 79 int vmm_create_vm(struct vm_create_params *); 80 void init_emulated_hw(struct vmop_create_params *, int, 81 int[][VM_MAX_BASE_PER_DISK], int *); 82 void restore_emulated_hw(struct vm_create_params *, int, int *, 83 int[][VM_MAX_BASE_PER_DISK],int); 84 void vcpu_exit_inout(struct vm_run_params *); 85 int vcpu_exit_eptviolation(struct vm_run_params *); 86 uint8_t vcpu_exit_pci(struct vm_run_params *); 87 int vcpu_pic_intr(uint32_t, uint32_t, uint8_t); 88 int loadfile_bios(gzFile, off_t, struct vcpu_reg_state *); 89 int send_vm(int, struct vm_create_params *); 90 int dump_send_header(int); 91 int dump_vmr(int , struct vm_mem_range *); 92 int dump_mem(int, struct vm_create_params *); 93 void restore_vmr(int, struct vm_mem_range *); 94 void restore_mem(int, struct vm_create_params *); 95 int restore_vm_params(int, struct vm_create_params *); 96 void pause_vm(struct vm_create_params *); 97 void unpause_vm(struct vm_create_params *); 98 99 int translate_gva(struct vm_exit*, uint64_t, uint64_t *, int); 100 101 static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t, 102 size_t); 103 104 int 
con_fd; 105 struct vmd_vm *current_vm; 106 107 extern struct vmd *env; 108 109 extern char *__progname; 110 111 pthread_mutex_t threadmutex; 112 pthread_cond_t threadcond; 113 114 pthread_cond_t vcpu_run_cond[VMM_MAX_VCPUS_PER_VM]; 115 pthread_mutex_t vcpu_run_mtx[VMM_MAX_VCPUS_PER_VM]; 116 pthread_barrier_t vm_pause_barrier; 117 pthread_cond_t vcpu_unpause_cond[VMM_MAX_VCPUS_PER_VM]; 118 pthread_mutex_t vcpu_unpause_mtx[VMM_MAX_VCPUS_PER_VM]; 119 uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM]; 120 uint8_t vcpu_done[VMM_MAX_VCPUS_PER_VM]; 121 122 /* 123 * Represents a standard register set for an OS to be booted 124 * as a flat 64 bit address space. 125 * 126 * NOT set here are: 127 * RIP 128 * RSP 129 * GDTR BASE 130 * 131 * Specific bootloaders should clone this structure and override 132 * those fields as needed. 133 * 134 * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on 135 * features of the CPU in use. 136 */ 137 static const struct vcpu_reg_state vcpu_init_flat64 = { 138 .vrs_gprs[VCPU_REGS_RFLAGS] = 0x2, 139 .vrs_gprs[VCPU_REGS_RIP] = 0x0, 140 .vrs_gprs[VCPU_REGS_RSP] = 0x0, 141 .vrs_crs[VCPU_REGS_CR0] = CR0_ET | CR0_PE | CR0_PG, 142 .vrs_crs[VCPU_REGS_CR3] = PML4_PAGE, 143 .vrs_crs[VCPU_REGS_CR4] = CR4_PAE | CR4_PSE, 144 .vrs_crs[VCPU_REGS_PDPTE0] = 0ULL, 145 .vrs_crs[VCPU_REGS_PDPTE1] = 0ULL, 146 .vrs_crs[VCPU_REGS_PDPTE2] = 0ULL, 147 .vrs_crs[VCPU_REGS_PDPTE3] = 0ULL, 148 .vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0}, 149 .vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, 150 .vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, 151 .vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, 152 .vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, 153 .vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0}, 154 .vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0}, 155 .vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0}, 156 .vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0}, 157 .vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0}, 158 .vrs_msrs[VCPU_REGS_EFER] = EFER_LME | EFER_LMA, 159 .vrs_drs[VCPU_REGS_DR0] = 0x0, 160 .vrs_drs[VCPU_REGS_DR1] = 0x0, 161 .vrs_drs[VCPU_REGS_DR2] = 0x0, 162 .vrs_drs[VCPU_REGS_DR3] = 0x0, 163 .vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0, 164 .vrs_drs[VCPU_REGS_DR7] = 0x400, 165 .vrs_msrs[VCPU_REGS_STAR] = 0ULL, 166 .vrs_msrs[VCPU_REGS_LSTAR] = 0ULL, 167 .vrs_msrs[VCPU_REGS_CSTAR] = 0ULL, 168 .vrs_msrs[VCPU_REGS_SFMASK] = 0ULL, 169 .vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL, 170 .vrs_msrs[VCPU_REGS_MISC_ENABLE] = 0ULL, 171 .vrs_crs[VCPU_REGS_XCR0] = XCR0_X87 172 }; 173 174 /* 175 * Represents a standard register set for an BIOS to be booted 176 * as a flat 16 bit address space. 
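 *
 * Each vrs_sregs entry below is a { selector, limit, access rights, base }
 * tuple (cf. the vcpu segment info structure in vmmvar.h). A small worked
 * example, assuming the usual x86 reset semantics: with
 * CS = { 0xF000, 0xFFFF, 0x809F, 0xF0000 } and RIP = 0xFFF0, the first
 * instruction is fetched from 0xF0000 + 0xFFF0 = 0xFFFF0, i.e. 16 bytes
 * below 1MB, which is why loadfile_bios() below requires the BIOS image
 * to end at 1MB.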
177 */ 178 static const struct vcpu_reg_state vcpu_init_flat16 = { 179 .vrs_gprs[VCPU_REGS_RFLAGS] = 0x2, 180 .vrs_gprs[VCPU_REGS_RIP] = 0xFFF0, 181 .vrs_gprs[VCPU_REGS_RSP] = 0x0, 182 .vrs_crs[VCPU_REGS_CR0] = 0x60000010, 183 .vrs_crs[VCPU_REGS_CR3] = 0, 184 .vrs_sregs[VCPU_REGS_CS] = { 0xF000, 0xFFFF, 0x809F, 0xF0000}, 185 .vrs_sregs[VCPU_REGS_DS] = { 0x0, 0xFFFF, 0x8093, 0x0}, 186 .vrs_sregs[VCPU_REGS_ES] = { 0x0, 0xFFFF, 0x8093, 0x0}, 187 .vrs_sregs[VCPU_REGS_FS] = { 0x0, 0xFFFF, 0x8093, 0x0}, 188 .vrs_sregs[VCPU_REGS_GS] = { 0x0, 0xFFFF, 0x8093, 0x0}, 189 .vrs_sregs[VCPU_REGS_SS] = { 0x0, 0xFFFF, 0x8093, 0x0}, 190 .vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0}, 191 .vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0}, 192 .vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0}, 193 .vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0}, 194 .vrs_msrs[VCPU_REGS_EFER] = 0ULL, 195 .vrs_drs[VCPU_REGS_DR0] = 0x0, 196 .vrs_drs[VCPU_REGS_DR1] = 0x0, 197 .vrs_drs[VCPU_REGS_DR2] = 0x0, 198 .vrs_drs[VCPU_REGS_DR3] = 0x0, 199 .vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0, 200 .vrs_drs[VCPU_REGS_DR7] = 0x400, 201 .vrs_msrs[VCPU_REGS_STAR] = 0ULL, 202 .vrs_msrs[VCPU_REGS_LSTAR] = 0ULL, 203 .vrs_msrs[VCPU_REGS_CSTAR] = 0ULL, 204 .vrs_msrs[VCPU_REGS_SFMASK] = 0ULL, 205 .vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL, 206 .vrs_crs[VCPU_REGS_XCR0] = XCR0_X87 207 }; 208 209 /* 210 * loadfile_bios 211 * 212 * Alternatively to loadfile_elf, this function loads a non-ELF BIOS image 213 * directly into memory. 214 * 215 * Parameters: 216 * fp: file of a kernel file to load 217 * size: uncompressed size of the image 218 * (out) vrs: register state to set on init for this kernel 219 * 220 * Return values: 221 * 0 if successful 222 * various error codes returned from read(2) or loadelf functions 223 */ 224 int 225 loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_state *vrs) 226 { 227 off_t off; 228 229 /* Set up a "flat 16 bit" register state for BIOS */ 230 memcpy(vrs, &vcpu_init_flat16, sizeof(*vrs)); 231 232 /* Seek to the beginning of the BIOS image */ 233 if (gzseek(fp, 0, SEEK_SET) == -1) 234 return (-1); 235 236 /* The BIOS image must end at 1M */ 237 if ((off = 1048576 - size) < 0) 238 return (-1); 239 240 /* Read BIOS image into memory */ 241 if (mread(fp, off, size) != (size_t)size) { 242 errno = EIO; 243 return (-1); 244 } 245 246 log_debug("%s: loaded BIOS image", __func__); 247 248 return (0); 249 } 250 251 /* 252 * start_vm 253 * 254 * After forking a new VM process, starts the new VM with the creation 255 * parameters supplied (in the incoming vm->vm_params field). This 256 * function performs a basic sanity check on the incoming parameters 257 * and then performs the following steps to complete the creation of the VM: 258 * 259 * 1. validates and create the new VM 260 * 2. opens the imsg control channel to the parent and drops more privilege 261 * 3. drops additional privleges by calling pledge(2) 262 * 4. loads the kernel from the disk image or file descriptor 263 * 5. runs the VM's VCPU loops. 264 * 265 * Parameters: 266 * vm: The VM data structure that is including the VM create parameters. 267 * fd: The imsg socket that is connected to the parent process. 
268 * 269 * Return values: 270 * 0: success 271 * !0 : failure - typically an errno indicating the source of the failure 272 */ 273 int 274 start_vm(struct vmd_vm *vm, int fd) 275 { 276 struct vmop_create_params *vmc = &vm->vm_params; 277 struct vm_create_params *vcp = &vmc->vmc_params; 278 struct vcpu_reg_state vrs; 279 int nicfds[VMM_MAX_NICS_PER_VM]; 280 int ret; 281 gzFile fp; 282 size_t i; 283 struct vm_rwregs_params vrp; 284 struct stat sb; 285 286 /* Child */ 287 setproctitle("%s", vcp->vcp_name); 288 log_procinit(vcp->vcp_name); 289 290 if (!(vm->vm_state & VM_STATE_RECEIVED)) 291 create_memory_map(vcp); 292 293 ret = alloc_guest_mem(vcp); 294 295 if (ret) { 296 errno = ret; 297 fatal("could not allocate guest memory - exiting"); 298 } 299 300 ret = vmm_create_vm(vcp); 301 current_vm = vm; 302 303 /* send back the kernel-generated vm id (0 on error) */ 304 if (write(fd, &vcp->vcp_id, sizeof(vcp->vcp_id)) != 305 sizeof(vcp->vcp_id)) 306 fatal("write vcp id"); 307 308 if (ret) { 309 errno = ret; 310 fatal("create vmm ioctl failed - exiting"); 311 } 312 313 /* 314 * pledge in the vm processes: 315 * stdio - for malloc and basic I/O including events. 316 * recvfd - for send/recv. 317 * vmm - for the vmm ioctls and operations. 318 */ 319 if (pledge("stdio vmm recvfd", NULL) == -1) 320 fatal("pledge"); 321 322 if (vm->vm_state & VM_STATE_RECEIVED) { 323 ret = read(vm->vm_receive_fd, &vrp, sizeof(vrp)); 324 if (ret != sizeof(vrp)) { 325 fatal("received incomplete vrp - exiting"); 326 } 327 vrs = vrp.vrwp_regs; 328 } else { 329 /* 330 * Set up default "flat 64 bit" register state - RIP, 331 * RSP, and GDT info will be set in bootloader 332 */ 333 memcpy(&vrs, &vcpu_init_flat64, sizeof(vrs)); 334 335 /* Find and open kernel image */ 336 if ((fp = gzdopen(vm->vm_kernel, "r")) == NULL) 337 fatalx("failed to open kernel - exiting"); 338 339 /* Load kernel image */ 340 ret = loadfile_elf(fp, vcp, &vrs); 341 342 /* 343 * Try BIOS as a fallback (only if it was provided as an image 344 * with vm->vm_kernel and the file is not compressed) 345 */ 346 if (ret && errno == ENOEXEC && vm->vm_kernel != -1 && 347 gzdirect(fp) && (ret = fstat(vm->vm_kernel, &sb)) == 0) 348 ret = loadfile_bios(fp, sb.st_size, &vrs); 349 350 if (ret) 351 fatal("failed to load kernel or BIOS - exiting"); 352 353 gzclose(fp); 354 } 355 356 if (vm->vm_kernel != -1) 357 close(vm->vm_kernel); 358 359 con_fd = vm->vm_tty; 360 if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1) 361 fatal("failed to set nonblocking mode on console"); 362 363 for (i = 0; i < VMM_MAX_NICS_PER_VM; i++) 364 nicfds[i] = vm->vm_ifs[i].vif_fd; 365 366 event_init(); 367 368 if (vm->vm_state & VM_STATE_RECEIVED) { 369 restore_emulated_hw(vcp, vm->vm_receive_fd, nicfds, 370 vm->vm_disks, vm->vm_cdrom); 371 restore_mem(vm->vm_receive_fd, vcp); 372 if (restore_vm_params(vm->vm_receive_fd, vcp)) 373 fatal("restore vm params failed"); 374 unpause_vm(vcp); 375 } 376 377 if (vmm_pipe(vm, fd, vm_dispatch_vmm) == -1) 378 fatal("setup vm pipe"); 379 380 /* Execute the vcpu run loop(s) for this VM */ 381 ret = run_vm(vm->vm_cdrom, vm->vm_disks, nicfds, &vm->vm_params, &vrs); 382 383 /* Ensure that any in-flight data is written back */ 384 virtio_shutdown(vm); 385 386 return (ret); 387 } 388 389 /* 390 * vm_dispatch_vmm 391 * 392 * imsg callback for messages that are received from the vmm parent process. 
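 *
 * The handled message types mirror the switch below: IMSG_CTL_VERBOSE,
 * IMSG_VMDOP_VM_SHUTDOWN, IMSG_VMDOP_VM_REBOOT, IMSG_VMDOP_PAUSE_VM,
 * IMSG_VMDOP_UNPAUSE_VM, IMSG_VMDOP_SEND_VM_REQUEST and
 * IMSG_VMDOP_PRIV_GET_ADDR_RESPONSE; any other type is fatal.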
 */
void
vm_dispatch_vmm(int fd, short event, void *arg)
{
	struct vmd_vm *vm = arg;
	struct vmop_result vmr;
	struct vmop_addr_result var;
	struct imsgev *iev = &vm->vm_iev;
	struct imsgbuf *ibuf = &iev->ibuf;
	struct imsg imsg;
	ssize_t n;
	int verbose;

	if (event & EV_READ) {
		if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
			fatal("%s: imsg_read", __func__);
		if (n == 0)
			_exit(0);
	}

	if (event & EV_WRITE) {
		if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
			fatal("%s: msgbuf_write fd %d", __func__, ibuf->fd);
		if (n == 0)
			_exit(0);
	}

	for (;;) {
		if ((n = imsg_get(ibuf, &imsg)) == -1)
			fatal("%s: imsg_get", __func__);
		if (n == 0)
			break;

#if DEBUG > 1
		log_debug("%s: got imsg %d from %s",
		    __func__, imsg.hdr.type,
		    vm->vm_params.vmc_params.vcp_name);
#endif

		switch (imsg.hdr.type) {
		case IMSG_CTL_VERBOSE:
			IMSG_SIZE_CHECK(&imsg, &verbose);
			memcpy(&verbose, imsg.data, sizeof(verbose));
			log_setverbose(verbose);
			break;
		case IMSG_VMDOP_VM_SHUTDOWN:
			if (vmmci_ctl(VMMCI_SHUTDOWN) == -1)
				_exit(0);
			break;
		case IMSG_VMDOP_VM_REBOOT:
			if (vmmci_ctl(VMMCI_REBOOT) == -1)
				_exit(0);
			break;
		case IMSG_VMDOP_PAUSE_VM:
			vmr.vmr_result = 0;
			vmr.vmr_id = vm->vm_vmid;
			pause_vm(&vm->vm_params.vmc_params);
			imsg_compose_event(&vm->vm_iev,
			    IMSG_VMDOP_PAUSE_VM_RESPONSE,
			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
			    sizeof(vmr));
			break;
		case IMSG_VMDOP_UNPAUSE_VM:
			vmr.vmr_result = 0;
			vmr.vmr_id = vm->vm_vmid;
			unpause_vm(&vm->vm_params.vmc_params);
			imsg_compose_event(&vm->vm_iev,
			    IMSG_VMDOP_UNPAUSE_VM_RESPONSE,
			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
			    sizeof(vmr));
			break;
		case IMSG_VMDOP_SEND_VM_REQUEST:
			vmr.vmr_id = vm->vm_vmid;
			vmr.vmr_result = send_vm(imsg.fd,
			    &vm->vm_params.vmc_params);
			imsg_compose_event(&vm->vm_iev,
			    IMSG_VMDOP_SEND_VM_RESPONSE,
			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
			    sizeof(vmr));
			if (!vmr.vmr_result) {
				imsg_flush(&current_vm->vm_iev.ibuf);
				_exit(0);
			}
			break;
		case IMSG_VMDOP_PRIV_GET_ADDR_RESPONSE:
			IMSG_SIZE_CHECK(&imsg, &var);
			memcpy(&var, imsg.data, sizeof(var));

			log_debug("%s: received tap addr %s for nic %d",
			    vm->vm_params.vmc_params.vcp_name,
			    ether_ntoa((void *)var.var_addr), var.var_nic_idx);

			vionet_set_hostmac(vm, var.var_nic_idx, var.var_addr);
			break;
		default:
			fatalx("%s: got invalid imsg %d from %s",
			    __func__, imsg.hdr.type,
			    vm->vm_params.vmc_params.vcp_name);
		}
		imsg_free(&imsg);
	}
	imsg_event_add(iev);
}

/*
 * vm_shutdown
 *
 * Tell the vmm parent process to shutdown or reboot the VM and exit.
 */
__dead void
vm_shutdown(unsigned int cmd)
{
	switch (cmd) {
	case VMMCI_NONE:
	case VMMCI_SHUTDOWN:
		(void)imsg_compose_event(&current_vm->vm_iev,
		    IMSG_VMDOP_VM_SHUTDOWN, 0, 0, -1, NULL, 0);
		break;
	case VMMCI_REBOOT:
		(void)imsg_compose_event(&current_vm->vm_iev,
		    IMSG_VMDOP_VM_REBOOT, 0, 0, -1, NULL, 0);
		break;
	default:
		fatalx("invalid vm ctl command: %d", cmd);
	}
	imsg_flush(&current_vm->vm_iev.ibuf);

	_exit(0);
}

int
send_vm(int fd, struct vm_create_params *vcp)
{
	struct vm_rwregs_params vrp;
	struct vm_rwvmparams_params vpp;
	struct vmop_create_params *vmc;
	struct vm_terminate_params vtp;
	unsigned int flags = 0;
	unsigned int i;
	int ret = 0;
	size_t sz;

	if (dump_send_header(fd)) {
		log_info("%s: failed to send vm dump header", __func__);
		goto err;
	}

	pause_vm(vcp);

	vmc = calloc(1, sizeof(struct vmop_create_params));
	if (vmc == NULL) {
		log_warn("%s: calloc error getting vmc", __func__);
		ret = -1;
		goto err;
	}

	flags |= VMOP_CREATE_MEMORY;
	memcpy(&vmc->vmc_params, &current_vm->vm_params, sizeof(struct
	    vmop_create_params));
	vmc->vmc_flags = flags;
	vrp.vrwp_vm_id = vcp->vcp_id;
	vrp.vrwp_mask = VM_RWREGS_ALL;
	vpp.vpp_mask = VM_RWVMPARAMS_ALL;
	vpp.vpp_vm_id = vcp->vcp_id;

	sz = atomicio(vwrite, fd, vmc, sizeof(struct vmop_create_params));
	if (sz != sizeof(struct vmop_create_params)) {
		ret = -1;
		goto err;
	}

	for (i = 0; i < vcp->vcp_ncpus; i++) {
		vrp.vrwp_vcpu_id = i;
		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READREGS, &vrp))) {
			log_warn("%s: readregs failed", __func__);
			goto err;
		}

		sz = atomicio(vwrite, fd, &vrp,
		    sizeof(struct vm_rwregs_params));
		if (sz != sizeof(struct vm_rwregs_params)) {
			log_warn("%s: dumping registers failed", __func__);
			ret = -1;
			goto err;
		}
	}

	if ((ret = i8253_dump(fd)))
		goto err;
	if ((ret = i8259_dump(fd)))
		goto err;
	if ((ret = ns8250_dump(fd)))
		goto err;
	if ((ret = mc146818_dump(fd)))
		goto err;
	if ((ret = fw_cfg_dump(fd)))
		goto err;
	if ((ret = pci_dump(fd)))
		goto err;
	if ((ret = virtio_dump(fd)))
		goto err;
	if ((ret = dump_mem(fd, vcp)))
		goto err;

	for (i = 0; i < vcp->vcp_ncpus; i++) {
		vpp.vpp_vcpu_id = i;
		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READVMPARAMS, &vpp))) {
			log_warn("%s: readvmparams failed", __func__);
			goto err;
		}

		sz = atomicio(vwrite, fd, &vpp,
		    sizeof(struct vm_rwvmparams_params));
		if (sz != sizeof(struct vm_rwvmparams_params)) {
			log_warn("%s: dumping vm params failed", __func__);
			ret = -1;
			goto err;
		}
	}

	vtp.vtp_vm_id = vcp->vcp_id;
	if (ioctl(env->vmd_fd, VMM_IOC_TERM, &vtp) == -1) {
		log_warnx("%s: term IOC error: %d, %d", __func__,
		    errno, ENOENT);
	}
err:
	close(fd);
	if (ret)
		unpause_vm(vcp);
	return ret;
}

int
dump_send_header(int fd) {
	struct vm_dump_header vmh;
	int i;

	memcpy(&vmh.vmh_signature, VM_DUMP_SIGNATURE,
	    sizeof(vmh.vmh_signature));

	vmh.vmh_cpuids[0].code = 0x00;
	vmh.vmh_cpuids[0].leaf = 0x00;

	vmh.vmh_cpuids[1].code = 0x01;
	vmh.vmh_cpuids[1].leaf = 0x00;

	vmh.vmh_cpuids[2].code = 0x07;
	vmh.vmh_cpuids[2].leaf = 0x00;

	vmh.vmh_cpuids[3].code = 0x0d;
	vmh.vmh_cpuids[3].leaf = 0x00;

	vmh.vmh_cpuids[4].code = 0x80000001;
vmh.vmh_cpuids[4].leaf = 0x00; 647 648 vmh.vmh_version = VM_DUMP_VERSION; 649 650 for (i=0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) { 651 CPUID_LEAF(vmh.vmh_cpuids[i].code, 652 vmh.vmh_cpuids[i].leaf, 653 vmh.vmh_cpuids[i].a, 654 vmh.vmh_cpuids[i].b, 655 vmh.vmh_cpuids[i].c, 656 vmh.vmh_cpuids[i].d); 657 } 658 659 if (atomicio(vwrite, fd, &vmh, sizeof(vmh)) != sizeof(vmh)) 660 return (-1); 661 662 return (0); 663 } 664 665 int 666 dump_mem(int fd, struct vm_create_params *vcp) 667 { 668 unsigned int i; 669 int ret; 670 struct vm_mem_range *vmr; 671 672 for (i = 0; i < vcp->vcp_nmemranges; i++) { 673 vmr = &vcp->vcp_memranges[i]; 674 ret = dump_vmr(fd, vmr); 675 if (ret) 676 return ret; 677 } 678 return (0); 679 } 680 681 int 682 restore_vm_params(int fd, struct vm_create_params *vcp) { 683 unsigned int i; 684 struct vm_rwvmparams_params vpp; 685 686 for (i = 0; i < vcp->vcp_ncpus; i++) { 687 if (atomicio(read, fd, &vpp, sizeof(vpp)) != sizeof(vpp)) { 688 log_warn("%s: error restoring vm params", __func__); 689 return (-1); 690 } 691 vpp.vpp_vm_id = vcp->vcp_id; 692 vpp.vpp_vcpu_id = i; 693 if (ioctl(env->vmd_fd, VMM_IOC_WRITEVMPARAMS, &vpp) < 0) { 694 log_debug("%s: writing vm params failed", __func__); 695 return (-1); 696 } 697 } 698 return (0); 699 } 700 701 void 702 restore_mem(int fd, struct vm_create_params *vcp) 703 { 704 unsigned int i; 705 struct vm_mem_range *vmr; 706 707 for (i = 0; i < vcp->vcp_nmemranges; i++) { 708 vmr = &vcp->vcp_memranges[i]; 709 restore_vmr(fd, vmr); 710 } 711 } 712 713 int 714 dump_vmr(int fd, struct vm_mem_range *vmr) 715 { 716 size_t rem = vmr->vmr_size, read=0; 717 char buf[PAGE_SIZE]; 718 719 while (rem > 0) { 720 if (read_mem(vmr->vmr_gpa + read, buf, PAGE_SIZE)) { 721 log_warn("failed to read vmr"); 722 return (-1); 723 } 724 if (atomicio(vwrite, fd, buf, sizeof(buf)) != sizeof(buf)) { 725 log_warn("failed to dump vmr"); 726 return (-1); 727 } 728 rem = rem - PAGE_SIZE; 729 read = read + PAGE_SIZE; 730 } 731 return (0); 732 } 733 734 void 735 restore_vmr(int fd, struct vm_mem_range *vmr) 736 { 737 size_t rem = vmr->vmr_size, wrote=0; 738 char buf[PAGE_SIZE]; 739 740 while (rem > 0) { 741 if (atomicio(read, fd, buf, sizeof(buf)) != sizeof(buf)) 742 fatal("failed to restore vmr"); 743 if (write_mem(vmr->vmr_gpa + wrote, buf, PAGE_SIZE)) 744 fatal("failed to write vmr"); 745 rem = rem - PAGE_SIZE; 746 wrote = wrote + PAGE_SIZE; 747 } 748 } 749 750 void 751 pause_vm(struct vm_create_params *vcp) 752 { 753 unsigned int n; 754 int ret; 755 if (current_vm->vm_state & VM_STATE_PAUSED) 756 return; 757 758 current_vm->vm_state |= VM_STATE_PAUSED; 759 760 ret = pthread_barrier_init(&vm_pause_barrier, NULL, vcp->vcp_ncpus + 1); 761 if (ret) { 762 log_warnx("%s: cannot initialize pause barrier (%d)", 763 __progname, ret); 764 return; 765 } 766 767 for (n = 0; n < vcp->vcp_ncpus; n++) { 768 ret = pthread_cond_broadcast(&vcpu_run_cond[n]); 769 if (ret) { 770 log_warnx("%s: can't broadcast vcpu run cond (%d)", 771 __func__, (int)ret); 772 return; 773 } 774 } 775 ret = pthread_barrier_wait(&vm_pause_barrier); 776 if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) { 777 log_warnx("%s: could not wait on pause barrier (%d)", 778 __func__, (int)ret); 779 return; 780 } 781 782 ret = pthread_barrier_destroy(&vm_pause_barrier); 783 if (ret) { 784 log_warnx("%s: could not destroy pause barrier (%d)", 785 __progname, ret); 786 return; 787 } 788 789 i8253_stop(); 790 mc146818_stop(); 791 ns8250_stop(); 792 virtio_stop(vcp); 793 } 794 795 void 796 unpause_vm(struct 
vm_create_params *vcp) 797 { 798 unsigned int n; 799 int ret; 800 if (!(current_vm->vm_state & VM_STATE_PAUSED)) 801 return; 802 803 current_vm->vm_state &= ~VM_STATE_PAUSED; 804 for (n = 0; n < vcp->vcp_ncpus; n++) { 805 ret = pthread_cond_broadcast(&vcpu_unpause_cond[n]); 806 if (ret) { 807 log_warnx("%s: can't broadcast vcpu unpause cond (%d)", 808 __func__, (int)ret); 809 return; 810 } 811 } 812 813 i8253_start(); 814 mc146818_start(); 815 ns8250_start(); 816 virtio_start(vcp); 817 } 818 819 /* 820 * vcpu_reset 821 * 822 * Requests vmm(4) to reset the VCPUs in the indicated VM to 823 * the register state provided 824 * 825 * Parameters 826 * vmid: VM ID to reset 827 * vcpu_id: VCPU ID to reset 828 * vrs: the register state to initialize 829 * 830 * Return values: 831 * 0: success 832 * !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM ID is not 833 * valid) 834 */ 835 int 836 vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs) 837 { 838 struct vm_resetcpu_params vrp; 839 840 memset(&vrp, 0, sizeof(vrp)); 841 vrp.vrp_vm_id = vmid; 842 vrp.vrp_vcpu_id = vcpu_id; 843 memcpy(&vrp.vrp_init_state, vrs, sizeof(struct vcpu_reg_state)); 844 845 log_debug("%s: resetting vcpu %d for vm %d", __func__, vcpu_id, vmid); 846 847 if (ioctl(env->vmd_fd, VMM_IOC_RESETCPU, &vrp) == -1) 848 return (errno); 849 850 return (0); 851 } 852 853 /* 854 * create_memory_map 855 * 856 * Sets up the guest physical memory ranges that the VM can access. 857 * 858 * Parameters: 859 * vcp: VM create parameters describing the VM whose memory map 860 * is being created 861 * 862 * Return values: 863 * nothing 864 */ 865 void 866 create_memory_map(struct vm_create_params *vcp) 867 { 868 size_t len, mem_bytes, mem_mb; 869 870 mem_mb = vcp->vcp_memranges[0].vmr_size; 871 vcp->vcp_nmemranges = 0; 872 if (mem_mb < 1 || mem_mb > VMM_MAX_VM_MEM_SIZE) 873 return; 874 875 mem_bytes = mem_mb * 1024 * 1024; 876 877 /* First memory region: 0 - LOWMEM_KB (DOS low mem) */ 878 len = LOWMEM_KB * 1024; 879 vcp->vcp_memranges[0].vmr_gpa = 0x0; 880 vcp->vcp_memranges[0].vmr_size = len; 881 mem_bytes -= len; 882 883 /* 884 * Second memory region: LOWMEM_KB - 1MB. 885 * 886 * N.B. - Normally ROMs or parts of video RAM are mapped here. 887 * We have to add this region, because some systems 888 * unconditionally write to 0xb8000 (VGA RAM), and 889 * we need to make sure that vmm(4) permits accesses 890 * to it. So allocate guest memory for it. 891 */ 892 len = 0x100000 - LOWMEM_KB * 1024; 893 vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024; 894 vcp->vcp_memranges[1].vmr_size = len; 895 mem_bytes -= len; 896 897 /* Make sure that we do not place physical memory into MMIO ranges. */ 898 if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - 0x100000) 899 len = VMM_PCI_MMIO_BAR_BASE - 0x100000; 900 else 901 len = mem_bytes; 902 903 /* Third memory region: 1MB - (1MB + len) */ 904 vcp->vcp_memranges[2].vmr_gpa = 0x100000; 905 vcp->vcp_memranges[2].vmr_size = len; 906 mem_bytes -= len; 907 908 if (mem_bytes > 0) { 909 /* Fourth memory region for the remaining memory (if any) */ 910 vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1; 911 vcp->vcp_memranges[3].vmr_size = mem_bytes; 912 vcp->vcp_nmemranges = 4; 913 } else 914 vcp->vcp_nmemranges = 3; 915 } 916 917 /* 918 * alloc_guest_mem 919 * 920 * Allocates memory for the guest. 
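 *
 * (For scale: create_memory_map() above typically produces up to four such
 * ranges, e.g. for a guest with more RAM than fits below the PCI MMIO BAR
 * window: low memory below LOWMEM_KB, the LOWMEM_KB-1MB ROM/VGA hole, RAM
 * from 1MB up to VMM_PCI_MMIO_BAR_BASE, and any remainder starting at
 * VMM_PCI_MMIO_BAR_END + 1. Each of those gets its own mmap() below.)
 *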
921 * Instead of doing a single allocation with one mmap(), we allocate memory 922 * separately for every range for the following reasons: 923 * - ASLR for the individual ranges 924 * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to 925 * map the single mmap'd userspace memory to the individual guest physical 926 * memory ranges, the underlying amap of the single mmap'd range would have 927 * to allocate per-page reference counters. The reason is that the 928 * individual guest physical ranges would reference the single mmap'd region 929 * only partially. However, if every guest physical range has its own 930 * corresponding mmap'd userspace allocation, there are no partial 931 * references: every guest physical range fully references an mmap'd 932 * range => no per-page reference counters have to be allocated. 933 * 934 * Return values: 935 * 0: success 936 * !0: failure - errno indicating the source of the failure 937 */ 938 int 939 alloc_guest_mem(struct vm_create_params *vcp) 940 { 941 void *p; 942 int ret; 943 size_t i, j; 944 struct vm_mem_range *vmr; 945 946 for (i = 0; i < vcp->vcp_nmemranges; i++) { 947 vmr = &vcp->vcp_memranges[i]; 948 p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE, 949 MAP_PRIVATE | MAP_ANON, -1, 0); 950 if (p == MAP_FAILED) { 951 ret = errno; 952 for (j = 0; j < i; j++) { 953 vmr = &vcp->vcp_memranges[j]; 954 munmap((void *)vmr->vmr_va, vmr->vmr_size); 955 } 956 957 return (ret); 958 } 959 960 vmr->vmr_va = (vaddr_t)p; 961 } 962 963 return (0); 964 } 965 966 /* 967 * vmm_create_vm 968 * 969 * Requests vmm(4) to create a new VM using the supplied creation 970 * parameters. This operation results in the creation of the in-kernel 971 * structures for the VM, but does not start the VM's vcpu(s). 972 * 973 * Parameters: 974 * vcp: vm_create_params struct containing the VM's desired creation 975 * configuration 976 * 977 * Return values: 978 * 0: success 979 * !0 : ioctl to vmm(4) failed 980 */ 981 int 982 vmm_create_vm(struct vm_create_params *vcp) 983 { 984 /* Sanity check arguments */ 985 if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM) 986 return (EINVAL); 987 988 if (vcp->vcp_nmemranges == 0 || 989 vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES) 990 return (EINVAL); 991 992 if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM) 993 return (EINVAL); 994 995 if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM) 996 return (EINVAL); 997 998 if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) == -1) 999 return (errno); 1000 1001 return (0); 1002 } 1003 1004 /* 1005 * init_emulated_hw 1006 * 1007 * Initializes the userspace hardware emulation 1008 */ 1009 void 1010 init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom, 1011 int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps) 1012 { 1013 struct vm_create_params *vcp = &vmc->vmc_params; 1014 int i; 1015 uint64_t memlo, memhi; 1016 1017 /* Calculate memory size for NVRAM registers */ 1018 memlo = memhi = 0; 1019 if (vcp->vcp_nmemranges > 2) 1020 memlo = vcp->vcp_memranges[2].vmr_size - 15 * 0x100000; 1021 1022 if (vcp->vcp_nmemranges > 3) 1023 memhi = vcp->vcp_memranges[3].vmr_size; 1024 1025 /* Reset the IO port map */ 1026 memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS); 1027 1028 /* Init i8253 PIT */ 1029 i8253_init(vcp->vcp_id); 1030 ioports_map[TIMER_CTRL] = vcpu_exit_i8253; 1031 ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253; 1032 ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253; 1033 ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253; 1034 ioports_map[PCKBC_AUX] = vcpu_exit_i8253_misc; 
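
	/*
	 * A minimal sketch of how these tables are consumed (see
	 * vcpu_exit_inout() below): on an in/out exit, the guest's port
	 * number indexes ioports_map and the registered handler runs,
	 * roughly:
	 *
	 *	io_fn_t fn = ioports_map[vei->vei.vei_port];
	 *	intr = (fn != NULL) ? fn(vrp) : 0xFF;
	 *
	 * Reads from unregistered ports return 0xFFFFFFFF.
	 */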
1035 1036 /* Init mc146818 RTC */ 1037 mc146818_init(vcp->vcp_id, memlo, memhi); 1038 ioports_map[IO_RTC] = vcpu_exit_mc146818; 1039 ioports_map[IO_RTC + 1] = vcpu_exit_mc146818; 1040 1041 /* Init master and slave PICs */ 1042 i8259_init(); 1043 ioports_map[IO_ICU1] = vcpu_exit_i8259; 1044 ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259; 1045 ioports_map[IO_ICU2] = vcpu_exit_i8259; 1046 ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259; 1047 ioports_map[ELCR0] = vcpu_exit_elcr; 1048 ioports_map[ELCR1] = vcpu_exit_elcr; 1049 1050 /* Init ns8250 UART */ 1051 ns8250_init(con_fd, vcp->vcp_id); 1052 for (i = COM1_DATA; i <= COM1_SCR; i++) 1053 ioports_map[i] = vcpu_exit_com; 1054 1055 /* Init QEMU fw_cfg interface */ 1056 fw_cfg_init(vmc); 1057 ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg; 1058 ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg; 1059 ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma; 1060 ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma; 1061 1062 /* Initialize PCI */ 1063 for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++) 1064 ioports_map[i] = vcpu_exit_pci; 1065 1066 ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci; 1067 ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci; 1068 ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci; 1069 ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci; 1070 ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci; 1071 pci_init(); 1072 1073 /* Initialize virtio devices */ 1074 virtio_init(current_vm, child_cdrom, child_disks, child_taps); 1075 } 1076 /* 1077 * restore_emulated_hw 1078 * 1079 * Restores the userspace hardware emulation from fd 1080 */ 1081 void 1082 restore_emulated_hw(struct vm_create_params *vcp, int fd, 1083 int *child_taps, int child_disks[][VM_MAX_BASE_PER_DISK], int child_cdrom) 1084 { 1085 /* struct vm_create_params *vcp = &vmc->vmc_params; */ 1086 int i; 1087 memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS); 1088 1089 /* Init i8253 PIT */ 1090 i8253_restore(fd, vcp->vcp_id); 1091 ioports_map[TIMER_CTRL] = vcpu_exit_i8253; 1092 ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253; 1093 ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253; 1094 ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253; 1095 1096 /* Init master and slave PICs */ 1097 i8259_restore(fd); 1098 ioports_map[IO_ICU1] = vcpu_exit_i8259; 1099 ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259; 1100 ioports_map[IO_ICU2] = vcpu_exit_i8259; 1101 ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259; 1102 1103 /* Init ns8250 UART */ 1104 ns8250_restore(fd, con_fd, vcp->vcp_id); 1105 for (i = COM1_DATA; i <= COM1_SCR; i++) 1106 ioports_map[i] = vcpu_exit_com; 1107 1108 /* Init mc146818 RTC */ 1109 mc146818_restore(fd, vcp->vcp_id); 1110 ioports_map[IO_RTC] = vcpu_exit_mc146818; 1111 ioports_map[IO_RTC + 1] = vcpu_exit_mc146818; 1112 1113 /* Init QEMU fw_cfg interface */ 1114 fw_cfg_restore(fd); 1115 ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg; 1116 ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg; 1117 ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma; 1118 ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma; 1119 1120 /* Initialize PCI */ 1121 for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++) 1122 ioports_map[i] = vcpu_exit_pci; 1123 1124 ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci; 1125 ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci; 1126 ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci; 1127 ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci; 1128 ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci; 1129 
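
	/*
	 * Note: this mirrors init_emulated_hw() above, except that every
	 * device reloads its saved state from 'fd'. For a received VM,
	 * start_vm() drives the whole sequence roughly as:
	 *
	 *	restore_emulated_hw(vcp, vm->vm_receive_fd, nicfds,
	 *	    vm->vm_disks, vm->vm_cdrom);
	 *	restore_mem(vm->vm_receive_fd, vcp);
	 *	restore_vm_params(vm->vm_receive_fd, vcp);
	 *	unpause_vm(vcp);
	 */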
pci_restore(fd); 1130 virtio_restore(fd, current_vm, child_cdrom, child_disks, child_taps); 1131 } 1132 1133 /* 1134 * run_vm 1135 * 1136 * Runs the VM whose creation parameters are specified in vcp 1137 * 1138 * Parameters: 1139 * child_cdrom: previously-opened child ISO disk file descriptor 1140 * child_disks: previously-opened child VM disk file file descriptors 1141 * child_taps: previously-opened child tap file descriptors 1142 * vmc: vmop_create_params struct containing the VM's desired creation 1143 * configuration 1144 * vrs: VCPU register state to initialize 1145 * 1146 * Return values: 1147 * 0: the VM exited normally 1148 * !0 : the VM exited abnormally or failed to start 1149 */ 1150 int 1151 run_vm(int child_cdrom, int child_disks[][VM_MAX_BASE_PER_DISK], 1152 int *child_taps, struct vmop_create_params *vmc, 1153 struct vcpu_reg_state *vrs) 1154 { 1155 struct vm_create_params *vcp = &vmc->vmc_params; 1156 struct vm_rwregs_params vregsp; 1157 uint8_t evdone = 0; 1158 size_t i; 1159 int ret; 1160 pthread_t *tid, evtid; 1161 struct vm_run_params **vrp; 1162 void *exit_status; 1163 1164 if (vcp == NULL) 1165 return (EINVAL); 1166 1167 if (child_cdrom == -1 && strlen(vcp->vcp_cdrom)) 1168 return (EINVAL); 1169 1170 if (child_disks == NULL && vcp->vcp_ndisks != 0) 1171 return (EINVAL); 1172 1173 if (child_taps == NULL && vcp->vcp_nnics != 0) 1174 return (EINVAL); 1175 1176 if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM) 1177 return (EINVAL); 1178 1179 if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM) 1180 return (EINVAL); 1181 1182 if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM) 1183 return (EINVAL); 1184 1185 if (vcp->vcp_nmemranges == 0 || 1186 vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES) 1187 return (EINVAL); 1188 1189 tid = calloc(vcp->vcp_ncpus, sizeof(pthread_t)); 1190 vrp = calloc(vcp->vcp_ncpus, sizeof(struct vm_run_params *)); 1191 if (tid == NULL || vrp == NULL) { 1192 log_warn("%s: memory allocation error - exiting.", 1193 __progname); 1194 return (ENOMEM); 1195 } 1196 1197 log_debug("%s: initializing hardware for vm %s", __func__, 1198 vcp->vcp_name); 1199 1200 if (!(current_vm->vm_state & VM_STATE_RECEIVED)) 1201 init_emulated_hw(vmc, child_cdrom, child_disks, child_taps); 1202 1203 ret = pthread_mutex_init(&threadmutex, NULL); 1204 if (ret) { 1205 log_warn("%s: could not initialize thread state mutex", 1206 __func__); 1207 return (ret); 1208 } 1209 ret = pthread_cond_init(&threadcond, NULL); 1210 if (ret) { 1211 log_warn("%s: could not initialize thread state " 1212 "condition variable", __func__); 1213 return (ret); 1214 } 1215 1216 mutex_lock(&threadmutex); 1217 1218 log_debug("%s: starting vcpu threads for vm %s", __func__, 1219 vcp->vcp_name); 1220 1221 /* 1222 * Create and launch one thread for each VCPU. These threads may 1223 * migrate between PCPUs over time; the need to reload CPU state 1224 * in such situations is detected and performed by vmm(4) in the 1225 * kernel. 
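 *
 * Each thread runs vcpu_run_loop() and, when it finishes, reports back
 * through the shared threadmutex/threadcond pair, essentially:
 *
 *	mutex_lock(&threadmutex);
 *	vcpu_done[n] = 1;
 *	pthread_cond_signal(&threadcond);
 *	mutex_unlock(&threadmutex);
 *
 * The wait loop below then joins finished threads until all VCPUs are done.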
1226 */ 1227 for (i = 0 ; i < vcp->vcp_ncpus; i++) { 1228 vrp[i] = malloc(sizeof(struct vm_run_params)); 1229 if (vrp[i] == NULL) { 1230 log_warn("%s: memory allocation error - " 1231 "exiting.", __progname); 1232 /* caller will exit, so skip freeing */ 1233 return (ENOMEM); 1234 } 1235 vrp[i]->vrp_exit = malloc(sizeof(struct vm_exit)); 1236 if (vrp[i]->vrp_exit == NULL) { 1237 log_warn("%s: memory allocation error - " 1238 "exiting.", __progname); 1239 /* caller will exit, so skip freeing */ 1240 return (ENOMEM); 1241 } 1242 vrp[i]->vrp_vm_id = vcp->vcp_id; 1243 vrp[i]->vrp_vcpu_id = i; 1244 1245 if (vcpu_reset(vcp->vcp_id, i, vrs)) { 1246 log_warnx("%s: cannot reset VCPU %zu - exiting.", 1247 __progname, i); 1248 return (EIO); 1249 } 1250 1251 /* once more because reset_cpu changes regs */ 1252 if (current_vm->vm_state & VM_STATE_RECEIVED) { 1253 vregsp.vrwp_vm_id = vcp->vcp_id; 1254 vregsp.vrwp_vcpu_id = i; 1255 vregsp.vrwp_regs = *vrs; 1256 vregsp.vrwp_mask = VM_RWREGS_ALL; 1257 if ((ret = ioctl(env->vmd_fd, VMM_IOC_WRITEREGS, 1258 &vregsp)) == -1) { 1259 log_warn("%s: writeregs failed", __func__); 1260 return (ret); 1261 } 1262 } 1263 1264 ret = pthread_cond_init(&vcpu_run_cond[i], NULL); 1265 if (ret) { 1266 log_warnx("%s: cannot initialize cond var (%d)", 1267 __progname, ret); 1268 return (ret); 1269 } 1270 1271 ret = pthread_mutex_init(&vcpu_run_mtx[i], NULL); 1272 if (ret) { 1273 log_warnx("%s: cannot initialize mtx (%d)", 1274 __progname, ret); 1275 return (ret); 1276 } 1277 1278 ret = pthread_cond_init(&vcpu_unpause_cond[i], NULL); 1279 if (ret) { 1280 log_warnx("%s: cannot initialize unpause var (%d)", 1281 __progname, ret); 1282 return (ret); 1283 } 1284 1285 ret = pthread_mutex_init(&vcpu_unpause_mtx[i], NULL); 1286 if (ret) { 1287 log_warnx("%s: cannot initialize unpause mtx (%d)", 1288 __progname, ret); 1289 return (ret); 1290 } 1291 1292 vcpu_hlt[i] = 0; 1293 1294 /* Start each VCPU run thread at vcpu_run_loop */ 1295 ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]); 1296 if (ret) { 1297 /* caller will _exit after this return */ 1298 ret = errno; 1299 log_warn("%s: could not create vcpu thread %zu", 1300 __func__, i); 1301 return (ret); 1302 } 1303 } 1304 1305 log_debug("%s: waiting on events for VM %s", __func__, vcp->vcp_name); 1306 ret = pthread_create(&evtid, NULL, event_thread, &evdone); 1307 if (ret) { 1308 errno = ret; 1309 log_warn("%s: could not create event thread", __func__); 1310 return (ret); 1311 } 1312 1313 for (;;) { 1314 ret = pthread_cond_wait(&threadcond, &threadmutex); 1315 if (ret) { 1316 log_warn("%s: waiting on thread state condition " 1317 "variable failed", __func__); 1318 return (ret); 1319 } 1320 1321 /* 1322 * Did a VCPU thread exit with an error? => return the first one 1323 */ 1324 for (i = 0; i < vcp->vcp_ncpus; i++) { 1325 if (vcpu_done[i] == 0) 1326 continue; 1327 1328 if (pthread_join(tid[i], &exit_status)) { 1329 log_warn("%s: failed to join thread %zd - " 1330 "exiting", __progname, i); 1331 return (EIO); 1332 } 1333 1334 ret = (intptr_t)exit_status; 1335 } 1336 1337 /* Did the event thread exit? => return with an error */ 1338 if (evdone) { 1339 if (pthread_join(evtid, &exit_status)) { 1340 log_warn("%s: failed to join event thread - " 1341 "exiting", __progname); 1342 return (EIO); 1343 } 1344 1345 log_warnx("%s: vm %d event thread exited " 1346 "unexpectedly", __progname, vcp->vcp_id); 1347 return (EIO); 1348 } 1349 1350 /* Did all VCPU threads exit successfully? 
=> return */ 1351 for (i = 0; i < vcp->vcp_ncpus; i++) { 1352 if (vcpu_done[i] == 0) 1353 break; 1354 } 1355 if (i == vcp->vcp_ncpus) 1356 return (ret); 1357 1358 /* Some more threads to wait for, start over */ 1359 } 1360 1361 return (ret); 1362 } 1363 1364 void * 1365 event_thread(void *arg) 1366 { 1367 uint8_t *donep = arg; 1368 intptr_t ret; 1369 1370 ret = event_dispatch(); 1371 1372 mutex_lock(&threadmutex); 1373 *donep = 1; 1374 pthread_cond_signal(&threadcond); 1375 mutex_unlock(&threadmutex); 1376 1377 return (void *)ret; 1378 } 1379 1380 /* 1381 * vcpu_run_loop 1382 * 1383 * Runs a single VCPU until vmm(4) requires help handling an exit, 1384 * or the VM terminates. 1385 * 1386 * Parameters: 1387 * arg: vcpu_run_params for the VCPU being run by this thread 1388 * 1389 * Return values: 1390 * NULL: the VCPU shutdown properly 1391 * !NULL: error processing VCPU run, or the VCPU shutdown abnormally 1392 */ 1393 void * 1394 vcpu_run_loop(void *arg) 1395 { 1396 struct vm_run_params *vrp = (struct vm_run_params *)arg; 1397 intptr_t ret = 0; 1398 int irq; 1399 uint32_t n; 1400 1401 vrp->vrp_continue = 0; 1402 n = vrp->vrp_vcpu_id; 1403 1404 for (;;) { 1405 ret = pthread_mutex_lock(&vcpu_run_mtx[n]); 1406 1407 if (ret) { 1408 log_warnx("%s: can't lock vcpu run mtx (%d)", 1409 __func__, (int)ret); 1410 return ((void *)ret); 1411 } 1412 1413 /* If we are halted and need to pause, pause */ 1414 if (vcpu_hlt[n] && (current_vm->vm_state & VM_STATE_PAUSED)) { 1415 ret = pthread_barrier_wait(&vm_pause_barrier); 1416 if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) { 1417 log_warnx("%s: could not wait on pause barrier (%d)", 1418 __func__, (int)ret); 1419 return ((void *)ret); 1420 } 1421 1422 ret = pthread_mutex_lock(&vcpu_unpause_mtx[n]); 1423 if (ret) { 1424 log_warnx("%s: can't lock vcpu unpause mtx (%d)", 1425 __func__, (int)ret); 1426 return ((void *)ret); 1427 } 1428 1429 ret = pthread_cond_wait(&vcpu_unpause_cond[n], 1430 &vcpu_unpause_mtx[n]); 1431 if (ret) { 1432 log_warnx( 1433 "%s: can't wait on unpause cond (%d)", 1434 __func__, (int)ret); 1435 break; 1436 } 1437 ret = pthread_mutex_unlock(&vcpu_unpause_mtx[n]); 1438 if (ret) { 1439 log_warnx("%s: can't unlock unpause mtx (%d)", 1440 __func__, (int)ret); 1441 break; 1442 } 1443 } 1444 1445 /* If we are halted and not paused, wait */ 1446 if (vcpu_hlt[n]) { 1447 ret = pthread_cond_wait(&vcpu_run_cond[n], 1448 &vcpu_run_mtx[n]); 1449 1450 if (ret) { 1451 log_warnx( 1452 "%s: can't wait on cond (%d)", 1453 __func__, (int)ret); 1454 (void)pthread_mutex_unlock( 1455 &vcpu_run_mtx[n]); 1456 break; 1457 } 1458 } 1459 1460 ret = pthread_mutex_unlock(&vcpu_run_mtx[n]); 1461 1462 if (ret) { 1463 log_warnx("%s: can't unlock mutex on cond (%d)", 1464 __func__, (int)ret); 1465 break; 1466 } 1467 1468 if (vrp->vrp_irqready && i8259_is_pending()) { 1469 irq = i8259_ack(); 1470 vrp->vrp_irq = irq; 1471 } else 1472 vrp->vrp_irq = 0xFFFF; 1473 1474 /* Still more pending? 
*/ 1475 if (i8259_is_pending()) { 1476 /* 1477 * XXX can probably avoid ioctls here by providing intr 1478 * in vrp 1479 */ 1480 if (vcpu_pic_intr(vrp->vrp_vm_id, 1481 vrp->vrp_vcpu_id, 1)) { 1482 fatal("can't set INTR"); 1483 } 1484 } else { 1485 if (vcpu_pic_intr(vrp->vrp_vm_id, 1486 vrp->vrp_vcpu_id, 0)) { 1487 fatal("can't clear INTR"); 1488 } 1489 } 1490 1491 if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) == -1) { 1492 /* If run ioctl failed, exit */ 1493 ret = errno; 1494 log_warn("%s: vm %d / vcpu %d run ioctl failed", 1495 __func__, vrp->vrp_vm_id, n); 1496 break; 1497 } 1498 1499 /* If the VM is terminating, exit normally */ 1500 if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED) { 1501 ret = (intptr_t)NULL; 1502 break; 1503 } 1504 1505 if (vrp->vrp_exit_reason != VM_EXIT_NONE) { 1506 /* 1507 * vmm(4) needs help handling an exit, handle in 1508 * vcpu_exit. 1509 */ 1510 ret = vcpu_exit(vrp); 1511 if (ret) 1512 break; 1513 } 1514 } 1515 1516 mutex_lock(&threadmutex); 1517 vcpu_done[n] = 1; 1518 pthread_cond_signal(&threadcond); 1519 mutex_unlock(&threadmutex); 1520 1521 return ((void *)ret); 1522 } 1523 1524 int 1525 vcpu_pic_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr) 1526 { 1527 struct vm_intr_params vip; 1528 1529 memset(&vip, 0, sizeof(vip)); 1530 1531 vip.vip_vm_id = vm_id; 1532 vip.vip_vcpu_id = vcpu_id; /* XXX always 0? */ 1533 vip.vip_intr = intr; 1534 1535 if (ioctl(env->vmd_fd, VMM_IOC_INTR, &vip) == -1) 1536 return (errno); 1537 1538 return (0); 1539 } 1540 1541 /* 1542 * vcpu_exit_pci 1543 * 1544 * Handle all I/O to the emulated PCI subsystem. 1545 * 1546 * Parameters: 1547 * vrp: vcpu run paramters containing guest state for this exit 1548 * 1549 * Return value: 1550 * Interrupt to inject to the guest VM, or 0xFF if no interrupt should 1551 * be injected. 1552 */ 1553 uint8_t 1554 vcpu_exit_pci(struct vm_run_params *vrp) 1555 { 1556 struct vm_exit *vei = vrp->vrp_exit; 1557 uint8_t intr; 1558 1559 intr = 0xFF; 1560 1561 switch (vei->vei.vei_port) { 1562 case PCI_MODE1_ADDRESS_REG: 1563 pci_handle_address_reg(vrp); 1564 break; 1565 case PCI_MODE1_DATA_REG: 1566 case PCI_MODE1_DATA_REG + 1: 1567 case PCI_MODE1_DATA_REG + 2: 1568 case PCI_MODE1_DATA_REG + 3: 1569 pci_handle_data_reg(vrp); 1570 break; 1571 case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END: 1572 intr = pci_handle_io(vrp); 1573 break; 1574 default: 1575 log_warnx("%s: unknown PCI register 0x%llx", 1576 __progname, (uint64_t)vei->vei.vei_port); 1577 break; 1578 } 1579 1580 return (intr); 1581 } 1582 1583 /* 1584 * vcpu_exit_inout 1585 * 1586 * Handle all I/O exits that need to be emulated in vmd. This includes the 1587 * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device. 1588 * 1589 * Parameters: 1590 * vrp: vcpu run parameters containing guest state for this exit 1591 */ 1592 void 1593 vcpu_exit_inout(struct vm_run_params *vrp) 1594 { 1595 struct vm_exit *vei = vrp->vrp_exit; 1596 uint8_t intr = 0xFF; 1597 1598 if (ioports_map[vei->vei.vei_port] != NULL) 1599 intr = ioports_map[vei->vei.vei_port](vrp); 1600 else if (vei->vei.vei_dir == VEI_DIR_IN) 1601 set_return_data(vei, 0xFFFFFFFF); 1602 1603 if (intr != 0xFF) 1604 vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr); 1605 } 1606 1607 /* 1608 * vcpu_exit_eptviolation 1609 * 1610 * handle an EPT Violation 1611 * 1612 * Parameters: 1613 * vrp: vcpu run parameters containing guest state for this exit 1614 * 1615 * Return values: 1616 * 0: no action required 1617 * EAGAIN: a protection fault occured, kill the vm. 
1618 */ 1619 int 1620 vcpu_exit_eptviolation(struct vm_run_params *vrp) 1621 { 1622 struct vm_exit *ve = vrp->vrp_exit; 1623 1624 /* 1625 * vmd may be exiting to vmd to handle a pending interrupt 1626 * but last exit type may have been VMX_EXIT_EPT_VIOLATION, 1627 * check the fault_type to ensure we really are processing 1628 * a VMX_EXIT_EPT_VIOLATION. 1629 */ 1630 if (ve->vee.vee_fault_type == VEE_FAULT_PROTECT) { 1631 log_debug("%s: EPT Violation: rip=0x%llx", 1632 __progname, vrp->vrp_exit->vrs.vrs_gprs[VCPU_REGS_RIP]); 1633 return (EAGAIN); 1634 } 1635 1636 return (0); 1637 } 1638 1639 /* 1640 * vcpu_exit 1641 * 1642 * Handle a vcpu exit. This function is called when it is determined that 1643 * vmm(4) requires the assistance of vmd to support a particular guest 1644 * exit type (eg, accessing an I/O port or device). Guest state is contained 1645 * in 'vrp', and will be resent to vmm(4) on exit completion. 1646 * 1647 * Upon conclusion of handling the exit, the function determines if any 1648 * interrupts should be injected into the guest, and asserts the proper 1649 * IRQ line whose interrupt should be vectored. 1650 * 1651 * Parameters: 1652 * vrp: vcpu run parameters containing guest state for this exit 1653 * 1654 * Return values: 1655 * 0: the exit was handled successfully 1656 * 1: an error occurred (eg, unknown exit reason passed in 'vrp') 1657 */ 1658 int 1659 vcpu_exit(struct vm_run_params *vrp) 1660 { 1661 int ret; 1662 1663 switch (vrp->vrp_exit_reason) { 1664 case VMX_EXIT_INT_WINDOW: 1665 case SVM_VMEXIT_VINTR: 1666 case VMX_EXIT_CPUID: 1667 case VMX_EXIT_EXTINT: 1668 case SVM_VMEXIT_INTR: 1669 case SVM_VMEXIT_NPF: 1670 case SVM_VMEXIT_MSR: 1671 case SVM_VMEXIT_CPUID: 1672 /* 1673 * We may be exiting to vmd to handle a pending interrupt but 1674 * at the same time the last exit type may have been one of 1675 * these. In this case, there's nothing extra to be done 1676 * here (and falling through to the default case below results 1677 * in more vmd log spam). 1678 */ 1679 break; 1680 case VMX_EXIT_EPT_VIOLATION: 1681 ret = vcpu_exit_eptviolation(vrp); 1682 if (ret) 1683 return (ret); 1684 1685 break; 1686 case VMX_EXIT_IO: 1687 case SVM_VMEXIT_IOIO: 1688 vcpu_exit_inout(vrp); 1689 break; 1690 case VMX_EXIT_HLT: 1691 case SVM_VMEXIT_HLT: 1692 ret = pthread_mutex_lock(&vcpu_run_mtx[vrp->vrp_vcpu_id]); 1693 if (ret) { 1694 log_warnx("%s: can't lock vcpu mutex (%d)", 1695 __func__, ret); 1696 return (ret); 1697 } 1698 vcpu_hlt[vrp->vrp_vcpu_id] = 1; 1699 ret = pthread_mutex_unlock(&vcpu_run_mtx[vrp->vrp_vcpu_id]); 1700 if (ret) { 1701 log_warnx("%s: can't unlock vcpu mutex (%d)", 1702 __func__, ret); 1703 return (ret); 1704 } 1705 break; 1706 case VMX_EXIT_TRIPLE_FAULT: 1707 case SVM_VMEXIT_SHUTDOWN: 1708 /* reset VM */ 1709 return (EAGAIN); 1710 default: 1711 log_debug("%s: unknown exit reason 0x%x", 1712 __progname, vrp->vrp_exit_reason); 1713 } 1714 1715 /* Process any pending traffic */ 1716 vionet_process_rx(vrp->vrp_vm_id); 1717 1718 vrp->vrp_continue = 1; 1719 1720 return (0); 1721 } 1722 1723 /* 1724 * find_gpa_range 1725 * 1726 * Search for a contiguous guest physical mem range. 1727 * 1728 * Parameters: 1729 * vcp: VM create parameters that contain the memory map to search in 1730 * gpa: the starting guest physical address 1731 * len: the length of the memory range 1732 * 1733 * Return values: 1734 * NULL: on failure if there is no memory range as described by the parameters 1735 * Pointer to vm_mem_range that contains the start of the range otherwise. 
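 *
 * For example, a request that begins near the end of one memrange and
 * spills into the next succeeds only if the two ranges are contiguous
 * (previous vmr_gpa + vmr_size equals the next vmr_gpa); a gap, or a
 * request running past the last range, yields NULL.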
 */
static struct vm_mem_range *
find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len)
{
	size_t i, n;
	struct vm_mem_range *vmr;

	/* Find the first vm_mem_range that contains gpa */
	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		if (vmr->vmr_gpa + vmr->vmr_size >= gpa)
			break;
	}

	/* No range found. */
	if (i == vcp->vcp_nmemranges)
		return (NULL);

	/*
	 * vmr may cover the range [gpa, gpa + len) only partly. Make
	 * sure that the following vm_mem_ranges are contiguous and
	 * cover the rest.
	 */
	n = vmr->vmr_size - (gpa - vmr->vmr_gpa);
	if (len < n)
		len = 0;
	else
		len -= n;
	gpa = vmr->vmr_gpa + vmr->vmr_size;
	for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		if (gpa != vmr->vmr_gpa)
			return (NULL);
		if (len <= vmr->vmr_size)
			len = 0;
		else
			len -= vmr->vmr_size;

		gpa = vmr->vmr_gpa + vmr->vmr_size;
	}

	if (len != 0)
		return (NULL);

	return (vmr);
}

void *
vaddr_mem(paddr_t gpa, size_t len)
{
	struct vm_create_params *vcp = &current_vm->vm_params.vmc_params;
	size_t i;
	struct vm_mem_range *vmr;
	paddr_t gpend = gpa + len;

	/* Find the first vm_mem_range that contains gpa */
	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		if (gpa < vmr->vmr_gpa)
			continue;

		if (gpend >= vmr->vmr_gpa + vmr->vmr_size)
			continue;

		return ((char *)vmr->vmr_va + (gpa - vmr->vmr_gpa));
	}

	return (NULL);
}

/*
 * write_mem
 *
 * Copies data from 'buf' into the guest VM's memory at paddr 'dst'.
 *
 * Parameters:
 *  dst: the destination paddr_t in the guest VM
 *  buf: data to copy (or NULL to zero the data)
 *  len: number of bytes to copy
 *
 * Return values:
 *  0: success
 *  EINVAL: if the guest physical memory range [dst, dst + len) does not
 *      exist in the guest.
 */
int
write_mem(paddr_t dst, const void *buf, size_t len)
{
	const char *from = buf;
	char *to;
	size_t n, off;
	struct vm_mem_range *vmr;

	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, dst, len);
	if (vmr == NULL) {
		errno = EINVAL;
		log_warn("%s: failed - invalid memory range dst = 0x%lx, "
		    "len = 0x%zx", __func__, dst, len);
		return (EINVAL);
	}

	off = dst - vmr->vmr_gpa;
	while (len != 0) {
		n = vmr->vmr_size - off;
		if (len < n)
			n = len;

		to = (char *)vmr->vmr_va + off;
		if (buf == NULL)
			memset(to, 0, n);
		else {
			memcpy(to, from, n);
			from += n;
		}
		len -= n;
		off = 0;
		vmr++;
	}

	return (0);
}

/*
 * read_mem
 *
 * Reads memory at guest paddr 'src' into 'buf'.
 *
 * Parameters:
 *  src: the source paddr_t in the guest VM to read from.
 *  buf: destination (local) buffer
 *  len: number of bytes to read
 *
 * Return values:
 *  0: success
 *  EINVAL: if the guest physical memory range [dst, dst + len) does not
 *      exist in the guest.
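 *
 * A minimal usage sketch (hypothetical caller), pulling one page table
 * entry out of the guest, much as translate_gva() below does:
 *
 *	uint64_t pte;
 *	if (read_mem(pte_paddr, &pte, sizeof(pte)))
 *		return (EFAULT);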
 */
int
read_mem(paddr_t src, void *buf, size_t len)
{
	char *from, *to = buf;
	size_t n, off;
	struct vm_mem_range *vmr;

	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
	if (vmr == NULL) {
		errno = EINVAL;
		log_warn("%s: failed - invalid memory range src = 0x%lx, "
		    "len = 0x%zx", __func__, src, len);
		return (EINVAL);
	}

	off = src - vmr->vmr_gpa;
	while (len != 0) {
		n = vmr->vmr_size - off;
		if (len < n)
			n = len;

		from = (char *)vmr->vmr_va + off;
		memcpy(to, from, n);

		to += n;
		len -= n;
		off = 0;
		vmr++;
	}

	return (0);
}

int
iovec_mem(paddr_t src, size_t len, struct iovec *iov, int iovcnt)
{
	size_t n, off;
	struct vm_mem_range *vmr;
	int niov = 0;

	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
	if (vmr == NULL) {
		errno = EINVAL;
		return (-1);
	}

	off = src - vmr->vmr_gpa;
	while (len > 0) {
		if (niov == iovcnt) {
			errno = ENOMEM;
			return (-1);
		}

		n = vmr->vmr_size - off;
		if (len < n)
			n = len;

		iov[niov].iov_base = (char *)vmr->vmr_va + off;
		iov[niov].iov_len = n;

		niov++;

		len -= n;
		off = 0;
		vmr++;
	}

	return (niov);
}

/*
 * vcpu_assert_pic_irq
 *
 * Injects the specified IRQ on the supplied vcpu/vm
 *
 * Parameters:
 *  vm_id: VM ID to inject to
 *  vcpu_id: VCPU ID to inject to
 *  irq: IRQ to inject
 */
void
vcpu_assert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
{
	int ret;

	i8259_assert_irq(irq);

	if (i8259_is_pending()) {
		if (vcpu_pic_intr(vm_id, vcpu_id, 1))
			fatalx("%s: can't assert INTR", __func__);

		ret = pthread_mutex_lock(&vcpu_run_mtx[vcpu_id]);
		if (ret)
			fatalx("%s: can't lock vcpu mtx (%d)", __func__, ret);

		vcpu_hlt[vcpu_id] = 0;
		ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]);
		if (ret)
			fatalx("%s: can't signal (%d)", __func__, ret);
		ret = pthread_mutex_unlock(&vcpu_run_mtx[vcpu_id]);
		if (ret)
			fatalx("%s: can't unlock vcpu mtx (%d)", __func__, ret);
	}
}

/*
 * vcpu_deassert_pic_irq
 *
 * Clears the specified IRQ on the supplied vcpu/vm
 *
 * Parameters:
 *  vm_id: VM ID to clear in
 *  vcpu_id: VCPU ID to clear in
 *  irq: IRQ to clear
 */
void
vcpu_deassert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
{
	i8259_deassert_irq(irq);

	if (!i8259_is_pending()) {
		if (vcpu_pic_intr(vm_id, vcpu_id, 0))
			fatalx("%s: can't deassert INTR for vm_id %d, "
			    "vcpu_id %d", __func__, vm_id, vcpu_id);
	}
}

/*
 * fd_hasdata
 *
 * Determines if data can be read from a file descriptor.
 *
 * Parameters:
 *  fd: the fd to check
 *
 * Return values:
 *  1 if data can be read from an fd, or 0 otherwise.
2010 */ 2011 int 2012 fd_hasdata(int fd) 2013 { 2014 struct pollfd pfd[1]; 2015 int nready, hasdata = 0; 2016 2017 pfd[0].fd = fd; 2018 pfd[0].events = POLLIN; 2019 nready = poll(pfd, 1, 0); 2020 if (nready == -1) 2021 log_warn("checking file descriptor for data failed"); 2022 else if (nready == 1 && pfd[0].revents & POLLIN) 2023 hasdata = 1; 2024 return (hasdata); 2025 } 2026 2027 /* 2028 * mutex_lock 2029 * 2030 * Wrapper function for pthread_mutex_lock that does error checking and that 2031 * exits on failure 2032 */ 2033 void 2034 mutex_lock(pthread_mutex_t *m) 2035 { 2036 int ret; 2037 2038 ret = pthread_mutex_lock(m); 2039 if (ret) { 2040 errno = ret; 2041 fatal("could not acquire mutex"); 2042 } 2043 } 2044 2045 /* 2046 * mutex_unlock 2047 * 2048 * Wrapper function for pthread_mutex_unlock that does error checking and that 2049 * exits on failure 2050 */ 2051 void 2052 mutex_unlock(pthread_mutex_t *m) 2053 { 2054 int ret; 2055 2056 ret = pthread_mutex_unlock(m); 2057 if (ret) { 2058 errno = ret; 2059 fatal("could not release mutex"); 2060 } 2061 } 2062 2063 /* 2064 * set_return_data 2065 * 2066 * Utility function for manipulating register data in vm exit info structs. This 2067 * function ensures that the data is copied to the vei->vei.vei_data field with 2068 * the proper size for the operation being performed. 2069 * 2070 * Parameters: 2071 * vei: exit information 2072 * data: return data 2073 */ 2074 void 2075 set_return_data(struct vm_exit *vei, uint32_t data) 2076 { 2077 switch (vei->vei.vei_size) { 2078 case 1: 2079 vei->vei.vei_data &= ~0xFF; 2080 vei->vei.vei_data |= (uint8_t)data; 2081 break; 2082 case 2: 2083 vei->vei.vei_data &= ~0xFFFF; 2084 vei->vei.vei_data |= (uint16_t)data; 2085 break; 2086 case 4: 2087 vei->vei.vei_data = data; 2088 break; 2089 } 2090 } 2091 2092 /* 2093 * get_input_data 2094 * 2095 * Utility function for manipulating register data in vm exit info 2096 * structs. This function ensures that the data is copied from the 2097 * vei->vei.vei_data field with the proper size for the operation being 2098 * performed. 2099 * 2100 * Parameters: 2101 * vei: exit information 2102 * data: location to store the result 2103 */ 2104 void 2105 get_input_data(struct vm_exit *vei, uint32_t *data) 2106 { 2107 switch (vei->vei.vei_size) { 2108 case 1: 2109 *data &= 0xFFFFFF00; 2110 *data |= (uint8_t)vei->vei.vei_data; 2111 break; 2112 case 2: 2113 *data &= 0xFFFF0000; 2114 *data |= (uint16_t)vei->vei.vei_data; 2115 break; 2116 case 4: 2117 *data = vei->vei.vei_data; 2118 break; 2119 default: 2120 log_warnx("%s: invalid i/o size %d", __func__, 2121 vei->vei.vei_size); 2122 } 2123 2124 } 2125 2126 /* 2127 * translate_gva 2128 * 2129 * Translates a guest virtual address to a guest physical address by walking 2130 * the currently active page table (if needed). 2131 * 2132 * Note - this function can possibly alter the supplied VCPU state. 2133 * Specifically, it may inject exceptions depending on the current VCPU 2134 * configuration, and may alter %cr2 on #PF. Consequently, this function 2135 * should only be used as part of instruction emulation. 2136 * 2137 * Parameters: 2138 * exit: The VCPU this translation should be performed for (guest MMU settings 2139 * are gathered from this VCPU) 2140 * va: virtual address to translate 2141 * pa: pointer to paddr_t variable that will receive the translated physical 2142 * address. 'pa' is unchanged on error. 
/*
 * translate_gva
 *
 * Translates a guest virtual address to a guest physical address by walking
 * the currently active page table (if needed).
 *
 * Note - this function can possibly alter the supplied VCPU state.
 * Specifically, it may inject exceptions depending on the current VCPU
 * configuration, and may alter %cr2 on #PF. Consequently, this function
 * should only be used as part of instruction emulation.
 *
 * Parameters:
 *  exit: The VCPU this translation should be performed for (guest MMU settings
 *   are gathered from this VCPU)
 *  va: virtual address to translate
 *  pa: pointer to paddr_t variable that will receive the translated physical
 *   address. 'pa' is unchanged on error.
 *  mode: one of PROT_READ, PROT_WRITE, PROT_EXEC indicating the mode in which
 *   the address should be translated
 *
 * Return values:
 *  0: the address was successfully translated - 'pa' contains the physical
 *     address currently mapped by 'va'.
 *  EFAULT: the PTE for 'VA' is unmapped. A #PF will be injected in this case
 *     and %cr2 set in the vcpu structure.
 *  EINVAL: an error occurred reading paging table structures
 */
int
translate_gva(struct vm_exit *exit, uint64_t va, uint64_t *pa, int mode)
{
	int level, shift, pdidx;
	uint64_t pte, pt_paddr, pte_paddr, mask, low_mask, high_mask;
	uint64_t shift_width, pte_size;
	struct vcpu_reg_state *vrs;

	vrs = &exit->vrs;

	if (!pa)
		return (EINVAL);

	if (!(vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PG)) {
		log_debug("%s: unpaged, va=pa=0x%llx", __func__, va);
		*pa = va;
		return (0);
	}

	pt_paddr = vrs->vrs_crs[VCPU_REGS_CR3];

	log_debug("%s: guest %%cr0=0x%llx, %%cr3=0x%llx", __func__,
	    vrs->vrs_crs[VCPU_REGS_CR0], vrs->vrs_crs[VCPU_REGS_CR3]);

	if (vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PE) {
		if (vrs->vrs_crs[VCPU_REGS_CR4] & CR4_PAE) {
			pte_size = sizeof(uint64_t);
			shift_width = 9;

			if (vrs->vrs_msrs[VCPU_REGS_EFER] & EFER_LMA) {
				/* 4 level paging */
				level = 4;
				mask = L4_MASK;
				shift = L4_SHIFT;
			} else {
				/* 32 bit with PAE paging */
				level = 3;
				mask = L3_MASK;
				shift = L3_SHIFT;
			}
		} else {
			/* 32 bit paging */
			level = 2;
			shift_width = 10;
			mask = 0xFFC00000;
			shift = 22;
			pte_size = sizeof(uint32_t);
		}
	} else
		return (EINVAL);

	/* XXX: Check for R bit in segment selector and set A bit */

	for (; level > 0; level--) {
		pdidx = (va & mask) >> shift;
		pte_paddr = (pt_paddr) + (pdidx * pte_size);

		log_debug("%s: read pte level %d @ GPA 0x%llx", __func__,
		    level, pte_paddr);
		if (read_mem(pte_paddr, &pte, pte_size)) {
			log_warn("%s: failed to read pte", __func__);
			return (EFAULT);
		}

		log_debug("%s: PTE @ 0x%llx = 0x%llx", __func__, pte_paddr,
		    pte);

		/* XXX: Set CR2 */
		if (!(pte & PG_V))
			return (EFAULT);

		/* XXX: Check for SMAP */
		if ((mode == PROT_WRITE) && !(pte & PG_RW))
			return (EPERM);

		if ((exit->cpl > 0) && !(pte & PG_u))
			return (EPERM);

		pte = pte | PG_U;
		if (mode == PROT_WRITE)
			pte = pte | PG_M;
		if (write_mem(pte_paddr, &pte, pte_size)) {
			log_warn("%s: failed to write back flags to pte",
			    __func__);
			return (EIO);
		}

		/* XXX: EINVAL if in 32bit and PG_PS is 1 but CR4.PSE is 0 */
		if (pte & PG_PS)
			break;

		if (level > 1) {
			pt_paddr = pte & PG_FRAME;
			shift -= shift_width;
			mask = mask >> shift_width;
		}
	}

	low_mask = (1 << shift) - 1;
	high_mask = (((uint64_t)1ULL << ((pte_size * 8) - 1)) - 1) ^ low_mask;
	*pa = (pte & high_mask) | (va & low_mask);

	log_debug("%s: final GPA for GVA 0x%llx = 0x%llx\n", __func__, va, *pa);

	return (0);
}
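/*
 * Worked example (illustrative, not part of the original source): for a long
 * mode (4-level) walk above that reaches a normal 4 KiB leaf, the loop has
 * reduced 'shift' from L4_SHIFT (39) to 12, so with pte_size == 8:
 *
 *	low_mask  = (1 << 12) - 1                 = 0x0000000000000fff
 *	high_mask = ((1ULL << 63) - 1) ^ low_mask = 0x7ffffffffffff000
 *	*pa       = (pte & high_mask) | (va & low_mask)
 *
 * i.e. the page frame comes from the final PTE and the low 12 bits come from
 * the virtual address.  If the walk stops earlier on a PG_PS entry at the
 * 2 MiB level, 'shift' is still 21 and the low 21 bits are taken from 'va'
 * instead.
 */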
/*
 * vm_pipe_init
 *
 * Initialize a vm_dev_pipe, setting up its file descriptors and its
 * event structure with the given callback.
 *
 * Parameters:
 *  p: pointer to vm_dev_pipe struct to initialize
 *  cb: callback to use for READ events on the read end of the pipe
 */
void
vm_pipe_init(struct vm_dev_pipe *p, void (*cb)(int, short, void *))
{
	int ret;
	int fds[2];

	memset(p, 0, sizeof(struct vm_dev_pipe));

	ret = pipe(fds);
	if (ret)
		fatal("failed to create vm_dev_pipe pipe");

	p->read = fds[0];
	p->write = fds[1];

	event_set(&p->read_ev, p->read, EV_READ | EV_PERSIST, cb, NULL);
}

/*
 * vm_pipe_send
 *
 * Send a message to an emulated device via the provided vm_dev_pipe.
 *
 * Parameters:
 *  p: pointer to initialized vm_dev_pipe
 *  msg: message to send in the channel
 */
void
vm_pipe_send(struct vm_dev_pipe *p, enum pipe_msg_type msg)
{
	size_t n;
	n = write(p->write, &msg, sizeof(msg));
	if (n != sizeof(msg))
		fatal("failed to write to device pipe");
}

/*
 * vm_pipe_recv
 *
 * Receive a message for an emulated device via the provided vm_dev_pipe.
 * Returns the message value, otherwise will exit on failure.
 *
 * Parameters:
 *  p: pointer to initialized vm_dev_pipe
 *
 * Return values:
 *  a value of enum pipe_msg_type or fatal exit on read(2) error
 */
enum pipe_msg_type
vm_pipe_recv(struct vm_dev_pipe *p)
{
	size_t n;
	enum pipe_msg_type msg;
	n = read(p->read, &msg, sizeof(msg));
	if (n != sizeof(msg))
		fatal("failed to read from device pipe");

	return msg;
}
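/*
 * Illustrative sketch (not part of the original source): typical use of the
 * vm_dev_pipe helpers above.  The pipe is initialized once and its read end
 * added to the event loop; another thread later posts a message which the
 * libevent callback drains with vm_pipe_recv().  example_dev_pipe,
 * example_pipe_dispatch and MSG_TYPE_EXAMPLE are placeholders; real devices
 * define their own pipe_msg_type values.
 */
#if 0
static struct vm_dev_pipe example_dev_pipe;

static void
example_pipe_dispatch(int fd, short event, void *arg)
{
	enum pipe_msg_type msg;

	msg = vm_pipe_recv(&example_dev_pipe);
	if (msg == MSG_TYPE_EXAMPLE) {
		/* perform the requested work on the event thread */
	}
}

static void
example_pipe_setup(void)
{
	vm_pipe_init(&example_dev_pipe, example_pipe_dispatch);
	event_add(&example_dev_pipe.read_ev, NULL);
}

/* Called from a vcpu or device thread. */
static void
example_pipe_post(void)
{
	vm_pipe_send(&example_dev_pipe, MSG_TYPE_EXAMPLE);
}
#endif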