xref: /openbsd-src/usr.sbin/vmd/vmm.c (revision 99fd087599a8791921855f21bd7e36130f39aadc)
1 /*	$OpenBSD: vmm.c,v 1.95 2019/12/11 06:45:17 pd Exp $	*/
2 
3 /*
4  * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/param.h>	/* nitems */
20 #include <sys/ioctl.h>
21 #include <sys/queue.h>
22 #include <sys/wait.h>
23 #include <sys/uio.h>
24 #include <sys/socket.h>
25 #include <sys/time.h>
26 #include <sys/mman.h>
27 
28 #include <dev/ic/i8253reg.h>
29 #include <dev/isa/isareg.h>
30 #include <dev/pci/pcireg.h>
31 
32 #include <machine/param.h>
33 #include <machine/psl.h>
34 #include <machine/specialreg.h>
35 #include <machine/vmmvar.h>
36 
37 #include <net/if.h>
38 
39 #include <errno.h>
40 #include <event.h>
41 #include <fcntl.h>
42 #include <imsg.h>
43 #include <limits.h>
44 #include <poll.h>
45 #include <pthread.h>
46 #include <stddef.h>
47 #include <stdio.h>
48 #include <stdlib.h>
49 #include <string.h>
50 #include <unistd.h>
51 #include <util.h>
52 
53 #include "vmd.h"
54 #include "vmm.h"
55 
56 void vmm_sighdlr(int, short, void *);
57 int vmm_start_vm(struct imsg *, uint32_t *, pid_t *);
58 int vmm_dispatch_parent(int, struct privsep_proc *, struct imsg *);
59 void vmm_run(struct privsep *, struct privsep_proc *, void *);
60 void vmm_dispatch_vm(int, short, void *);
61 int terminate_vm(struct vm_terminate_params *);
62 int get_info_vm(struct privsep *, struct imsg *, int);
63 int opentap(char *);
64 
65 extern struct vmd *env;
66 
67 static struct privsep_proc procs[] = {
68 	{ "parent",	PROC_PARENT,	vmm_dispatch_parent  },
69 };
70 
71 void
72 vmm(struct privsep *ps, struct privsep_proc *p)
73 {
74 	proc_run(ps, p, procs, nitems(procs), vmm_run, NULL);
75 }
76 
/*
 * vmm_run
 *
 * Run-time setup for the vmm process: initialize the configuration
 * state, replace the generic SIGCHLD handler with vmm_sighdlr (which
 * reaps exited VM child processes), pledge down, and terminate any
 * VMs still known to vmm(4) from a previous run.
 */
void
vmm_run(struct privsep *ps, struct privsep_proc *p, void *arg)
{
	if (config_init(ps->ps_env) == -1)
		fatal("failed to initialize configuration");

	/* Swap in our own SIGCHLD handler so we reap VM children here. */
	signal_del(&ps->ps_evsigchld);
	signal_set(&ps->ps_evsigchld, SIGCHLD, vmm_sighdlr, ps);
	signal_add(&ps->ps_evsigchld, NULL);

	/*
	 * pledge in the vmm process:
	 * stdio - for malloc and basic I/O including events.
	 * vmm - for the vmm ioctls and operations.
	 * proc - for forking and maintaining vms.
	 * sendfd - for sending send/recv fds to vm proc.
	 * recvfd - for disks, interfaces and other fds.
	 */
	if (pledge("stdio vmm sendfd recvfd proc", NULL) == -1)
		fatal("pledge");

	/* Get and terminate all running VMs */
	get_info_vm(ps, NULL, 1);
}
101 
102 int
103 vmm_dispatch_parent(int fd, struct privsep_proc *p, struct imsg *imsg)
104 {
105 	struct privsep		*ps = p->p_ps;
106 	int			 res = 0, cmd = 0, verbose, ret;
107 	struct vmd_vm		*vm = NULL;
108 	struct vm_terminate_params vtp;
109 	struct vmop_id		 vid;
110 	struct vmop_result	 vmr;
111 	struct vmop_create_params vmc;
112 	uint32_t		 id = 0, peerid = imsg->hdr.peerid;
113 	pid_t			 pid = 0;
114 	unsigned int		 mode, flags;
115 
116 	switch (imsg->hdr.type) {
117 	case IMSG_VMDOP_START_VM_REQUEST:
118 		res = config_getvm(ps, imsg);
119 		if (res == -1) {
120 			res = errno;
121 			cmd = IMSG_VMDOP_START_VM_RESPONSE;
122 		}
123 		break;
124 	case IMSG_VMDOP_START_VM_CDROM:
125 		res = config_getcdrom(ps, imsg);
126 		if (res == -1) {
127 			res = errno;
128 			cmd = IMSG_VMDOP_START_VM_RESPONSE;
129 		}
130 		break;
131 	case IMSG_VMDOP_START_VM_DISK:
132 		res = config_getdisk(ps, imsg);
133 		if (res == -1) {
134 			res = errno;
135 			cmd = IMSG_VMDOP_START_VM_RESPONSE;
136 		}
137 		break;
138 	case IMSG_VMDOP_START_VM_IF:
139 		res = config_getif(ps, imsg);
140 		if (res == -1) {
141 			res = errno;
142 			cmd = IMSG_VMDOP_START_VM_RESPONSE;
143 		}
144 		break;
145 	case IMSG_VMDOP_START_VM_END:
146 		res = vmm_start_vm(imsg, &id, &pid);
147 		/* Check if the ID can be mapped correctly */
148 		if ((id = vm_id2vmid(id, NULL)) == 0)
149 			res = ENOENT;
150 		cmd = IMSG_VMDOP_START_VM_RESPONSE;
151 		break;
152 	case IMSG_VMDOP_WAIT_VM_REQUEST:
153 		IMSG_SIZE_CHECK(imsg, &vid);
154 		memcpy(&vid, imsg->data, sizeof(vid));
155 		id = vid.vid_id;
156 
157 		DPRINTF("%s: recv'ed WAIT_VM for %d", __func__, id);
158 
159 		cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
160 		if (id == 0) {
161 			res = ENOENT;
162 		} else if ((vm = vm_getbyvmid(id)) != NULL) {
163 			if (vm->vm_peerid != (uint32_t)-1) {
164 				peerid = vm->vm_peerid;
165 				res = EINTR;
166 			} else
167 				cmd = 0;
168 			vm->vm_peerid = imsg->hdr.peerid;
169 		} else {
170 			/* vm doesn't exist, cannot stop vm */
171 			log_debug("%s: cannot stop vm that is not running",
172 			    __func__);
173 			res = VMD_VM_STOP_INVALID;
174 		}
175 		break;
176 	case IMSG_VMDOP_TERMINATE_VM_REQUEST:
177 		IMSG_SIZE_CHECK(imsg, &vid);
178 		memcpy(&vid, imsg->data, sizeof(vid));
179 		id = vid.vid_id;
180 		flags = vid.vid_flags;
181 
182 		DPRINTF("%s: recv'ed TERMINATE_VM for %d", __func__, id);
183 
184 		cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
185 
186 		if (id == 0) {
187 			res = ENOENT;
188 		} else if ((vm = vm_getbyvmid(id)) != NULL) {
189 			if (flags & VMOP_FORCE) {
190 				vtp.vtp_vm_id = vm_vmid2id(vm->vm_vmid, vm);
191 				vm->vm_state |= VM_STATE_SHUTDOWN;
192 				(void)terminate_vm(&vtp);
193 				res = 0;
194 			} else if (!(vm->vm_state & VM_STATE_SHUTDOWN)) {
195 				log_debug("%s: sending shutdown request"
196 				    " to vm %d", __func__, id);
197 
198 				/*
199 				 * Request reboot but mark the VM as shutting
200 				 * down. This way we can terminate the VM after
201 				 * the triple fault instead of reboot and
202 				 * avoid being stuck in the ACPI-less powerdown
203 				 * ("press any key to reboot") of the VM.
204 				 */
205 				vm->vm_state |= VM_STATE_SHUTDOWN;
206 				if (imsg_compose_event(&vm->vm_iev,
207 				    IMSG_VMDOP_VM_REBOOT,
208 				    0, 0, -1, NULL, 0) == -1)
209 					res = errno;
210 				else
211 					res = 0;
212 			} else {
213 				/*
214 				 * VM is currently being shutdown.
215 				 * Check to see if the VM process is still
216 				 * active.  If not, return VMD_VM_STOP_INVALID.
217 				 */
218 				if (vm_vmid2id(vm->vm_vmid, vm) == 0) {
219 					log_debug("%s: no vm running anymore",
220 					    __func__);
221 					res = VMD_VM_STOP_INVALID;
222 				}
223 			}
224 			if ((flags & VMOP_WAIT) &&
225 			    res == 0 && (vm->vm_state & VM_STATE_SHUTDOWN)) {
226 				if (vm->vm_peerid != (uint32_t)-1) {
227 					peerid = vm->vm_peerid;
228 					res = EINTR;
229 				} else
230 					cmd = 0;
231 				vm->vm_peerid = imsg->hdr.peerid;
232 			}
233 		} else {
234 			/* vm doesn't exist, cannot stop vm */
235 			log_debug("%s: cannot stop vm that is not running",
236 			    __func__);
237 			res = VMD_VM_STOP_INVALID;
238 		}
239 		break;
240 	case IMSG_VMDOP_GET_INFO_VM_REQUEST:
241 		res = get_info_vm(ps, imsg, 0);
242 		cmd = IMSG_VMDOP_GET_INFO_VM_END_DATA;
243 		break;
244 	case IMSG_VMDOP_CONFIG:
245 		config_getconfig(env, imsg);
246 		break;
247 	case IMSG_CTL_RESET:
248 		IMSG_SIZE_CHECK(imsg, &mode);
249 		memcpy(&mode, imsg->data, sizeof(mode));
250 
251 		if (mode & CONFIG_VMS) {
252 			/* Terminate and remove all VMs */
253 			vmm_shutdown();
254 			mode &= ~CONFIG_VMS;
255 		}
256 
257 		config_getreset(env, imsg);
258 		break;
259 	case IMSG_CTL_VERBOSE:
260 		IMSG_SIZE_CHECK(imsg, &verbose);
261 		memcpy(&verbose, imsg->data, sizeof(verbose));
262 		log_setverbose(verbose);
263 
264 		/* Forward message to each VM process */
265 		TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
266 			imsg_compose_event(&vm->vm_iev,
267 			    imsg->hdr.type, imsg->hdr.peerid, imsg->hdr.pid,
268 			    -1, &verbose, sizeof(verbose));
269 		}
270 		break;
271 	case IMSG_VMDOP_PAUSE_VM:
272 		IMSG_SIZE_CHECK(imsg, &vid);
273 		memcpy(&vid, imsg->data, sizeof(vid));
274 		id = vid.vid_id;
275 		vm = vm_getbyvmid(id);
276 		if ((vm = vm_getbyvmid(id)) == NULL) {
277 			res = ENOENT;
278 			cmd = IMSG_VMDOP_PAUSE_VM_RESPONSE;
279 			break;
280 		}
281 		imsg_compose_event(&vm->vm_iev,
282 		    imsg->hdr.type, imsg->hdr.peerid, imsg->hdr.pid,
283 		    imsg->fd, &vid, sizeof(vid));
284 		break;
285 	case IMSG_VMDOP_UNPAUSE_VM:
286 		IMSG_SIZE_CHECK(imsg, &vid);
287 		memcpy(&vid, imsg->data, sizeof(vid));
288 		id = vid.vid_id;
289 		if ((vm = vm_getbyvmid(id)) == NULL) {
290 			res = ENOENT;
291 			cmd = IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
292 			break;
293 		}
294 		imsg_compose_event(&vm->vm_iev,
295 		    imsg->hdr.type, imsg->hdr.peerid, imsg->hdr.pid,
296 		    imsg->fd, &vid, sizeof(vid));
297 		break;
298 	case IMSG_VMDOP_SEND_VM_REQUEST:
299 		IMSG_SIZE_CHECK(imsg, &vid);
300 		memcpy(&vid, imsg->data, sizeof(vid));
301 		id = vid.vid_id;
302 		if ((vm = vm_getbyvmid(id)) == NULL) {
303 			res = ENOENT;
304 			close(imsg->fd);
305 			cmd = IMSG_VMDOP_START_VM_RESPONSE;
306 			break;
307 		}
308 		imsg_compose_event(&vm->vm_iev,
309 		    imsg->hdr.type, imsg->hdr.peerid, imsg->hdr.pid,
310 		    imsg->fd, &vid, sizeof(vid));
311 		break;
312 	case IMSG_VMDOP_RECEIVE_VM_REQUEST:
313 		IMSG_SIZE_CHECK(imsg, &vmc);
314 		memcpy(&vmc, imsg->data, sizeof(vmc));
315 		ret = vm_register(ps, &vmc, &vm,
316 		    imsg->hdr.peerid, vmc.vmc_owner.uid);
317 		vm->vm_tty = imsg->fd;
318 		vm->vm_state |= VM_STATE_RECEIVED;
319 		vm->vm_state |= VM_STATE_PAUSED;
320 		break;
321 	case IMSG_VMDOP_RECEIVE_VM_END:
322 		if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) {
323 			res = ENOENT;
324 			close(imsg->fd);
325 			cmd = IMSG_VMDOP_START_VM_RESPONSE;
326 			break;
327 		}
328 		vm->vm_receive_fd = imsg->fd;
329 		res = vmm_start_vm(imsg, &id, &pid);
330 		/* Check if the ID can be mapped correctly */
331 		if ((id = vm_id2vmid(id, NULL)) == 0)
332 			res = ENOENT;
333 		cmd = IMSG_VMDOP_START_VM_RESPONSE;
334 		break;
335 	default:
336 		return (-1);
337 	}
338 
339 	switch (cmd) {
340 	case 0:
341 		break;
342 	case IMSG_VMDOP_START_VM_RESPONSE:
343 		if (res != 0) {
344 			/* Remove local reference if it exists */
345 			if ((vm = vm_getbyvmid(imsg->hdr.peerid)) != NULL) {
346 				log_debug("%s: removing vm, START_VM_RESPONSE",
347 				    __func__);
348 				vm_remove(vm, __func__);
349 			}
350 		}
351 		if (id == 0)
352 			id = imsg->hdr.peerid;
353 	case IMSG_VMDOP_PAUSE_VM_RESPONSE:
354 	case IMSG_VMDOP_UNPAUSE_VM_RESPONSE:
355 	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
356 		memset(&vmr, 0, sizeof(vmr));
357 		vmr.vmr_result = res;
358 		vmr.vmr_id = id;
359 		vmr.vmr_pid = pid;
360 		if (proc_compose_imsg(ps, PROC_PARENT, -1, cmd,
361 		    peerid, -1, &vmr, sizeof(vmr)) == -1)
362 			return (-1);
363 		break;
364 	default:
365 		if (proc_compose_imsg(ps, PROC_PARENT, -1, cmd,
366 		    peerid, -1, &res, sizeof(res)) == -1)
367 			return (-1);
368 		break;
369 	}
370 
371 	return (0);
372 }
373 
374 void
375 vmm_sighdlr(int sig, short event, void *arg)
376 {
377 	struct privsep *ps = arg;
378 	int status, ret = 0;
379 	uint32_t vmid;
380 	pid_t pid;
381 	struct vmop_result vmr;
382 	struct vmd_vm *vm;
383 	struct vm_terminate_params vtp;
384 
385 	log_debug("%s: handling signal %d", __func__, sig);
386 	switch (sig) {
387 	case SIGCHLD:
388 		do {
389 			pid = waitpid(-1, &status, WNOHANG);
390 			if (pid <= 0)
391 				continue;
392 
393 			if (WIFEXITED(status) || WIFSIGNALED(status)) {
394 				vm = vm_getbypid(pid);
395 				if (vm == NULL) {
396 					/*
397 					 * If the VM is gone already, it
398 					 * got terminated via a
399 					 * IMSG_VMDOP_TERMINATE_VM_REQUEST.
400 					 */
401 					continue;
402 				}
403 
404 				if (WIFEXITED(status))
405 					ret = WEXITSTATUS(status);
406 
407 				/* don't reboot on pending shutdown */
408 				if (ret == EAGAIN && (vm->vm_state & VM_STATE_SHUTDOWN))
409 					ret = 0;
410 
411 				vmid = vm->vm_params.vmc_params.vcp_id;
412 				vtp.vtp_vm_id = vmid;
413 
414 				if (terminate_vm(&vtp) == 0)
415 					log_debug("%s: terminated vm %s"
416 					    " (id %d)", __func__,
417 					    vm->vm_params.vmc_params.vcp_name,
418 					    vm->vm_vmid);
419 
420 				memset(&vmr, 0, sizeof(vmr));
421 				vmr.vmr_result = ret;
422 				vmr.vmr_id = vm_id2vmid(vmid, vm);
423 				if (proc_compose_imsg(ps, PROC_PARENT,
424 				    -1, IMSG_VMDOP_TERMINATE_VM_EVENT,
425 				    vm->vm_peerid, -1,
426 				    &vmr, sizeof(vmr)) == -1)
427 					log_warnx("could not signal "
428 					    "termination of VM %u to "
429 					    "parent", vm->vm_vmid);
430 
431 				vm_remove(vm, __func__);
432 			} else
433 				fatalx("unexpected cause of SIGCHLD");
434 		} while (pid > 0 || (pid == -1 && errno == EINTR));
435 		break;
436 	default:
437 		fatalx("unexpected signal");
438 	}
439 }
440 
441 /*
442  * vmm_shutdown
443  *
444  * Terminate VMs on shutdown to avoid "zombie VM" processes.
445  */
446 void
447 vmm_shutdown(void)
448 {
449 	struct vm_terminate_params vtp;
450 	struct vmd_vm *vm, *vm_next;
451 
452 	TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry, vm_next) {
453 		vtp.vtp_vm_id = vm_vmid2id(vm->vm_vmid, vm);
454 
455 		/* XXX suspend or request graceful shutdown */
456 		(void)terminate_vm(&vtp);
457 		vm_remove(vm, __func__);
458 	}
459 }
460 
461 /*
462  * vmm_pipe
463  *
464  * Create a new imsg control channel between vmm parent and a VM
465  * (can be called on both sides).
466  */
467 int
468 vmm_pipe(struct vmd_vm *vm, int fd, void (*cb)(int, short, void *))
469 {
470 	struct imsgev	*iev = &vm->vm_iev;
471 
472 	if (fcntl(fd, F_SETFL, O_NONBLOCK) == -1) {
473 		log_warn("failed to set nonblocking mode on vm pipe");
474 		return (-1);
475 	}
476 
477 	imsg_init(&iev->ibuf, fd);
478 	iev->handler = cb;
479 	iev->data = vm;
480 	imsg_event_add(iev);
481 
482 	return (0);
483 }
484 
485 /*
486  * vmm_dispatch_vm
487  *
488  * imsg callback for messages that are received from a VM child process.
489  */
490 void
491 vmm_dispatch_vm(int fd, short event, void *arg)
492 {
493 	struct vmd_vm		*vm = arg;
494 	struct vmop_result	 vmr;
495 	struct imsgev		*iev = &vm->vm_iev;
496 	struct imsgbuf		*ibuf = &iev->ibuf;
497 	struct imsg		 imsg;
498 	ssize_t			 n;
499 	unsigned int		 i;
500 
501 	if (event & EV_READ) {
502 		if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
503 			fatal("%s: imsg_read", __func__);
504 		if (n == 0) {
505 			/* this pipe is dead, so remove the event handler */
506 			event_del(&iev->ev);
507 			return;
508 		}
509 	}
510 
511 	if (event & EV_WRITE) {
512 		if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
513 			fatal("%s: msgbuf_write fd %d", __func__, ibuf->fd);
514 		if (n == 0) {
515 			/* this pipe is dead, so remove the event handler */
516 			event_del(&iev->ev);
517 			return;
518 		}
519 	}
520 
521 	for (;;) {
522 		if ((n = imsg_get(ibuf, &imsg)) == -1)
523 			fatal("%s: imsg_get", __func__);
524 		if (n == 0)
525 			break;
526 
527 		DPRINTF("%s: got imsg %d from %s",
528 		    __func__, imsg.hdr.type,
529 		    vm->vm_params.vmc_params.vcp_name);
530 
531 		switch (imsg.hdr.type) {
532 		case IMSG_VMDOP_VM_SHUTDOWN:
533 			vm->vm_state |= VM_STATE_SHUTDOWN;
534 			break;
535 		case IMSG_VMDOP_VM_REBOOT:
536 			vm->vm_state &= ~VM_STATE_SHUTDOWN;
537 			break;
538 		case IMSG_VMDOP_SEND_VM_RESPONSE:
539 			IMSG_SIZE_CHECK(&imsg, &vmr);
540 			memcpy(&vmr, imsg.data, sizeof(vmr));
541 			if (!vmr.vmr_result) {
542 				vm_remove(vm, __func__);
543 			}
544 		case IMSG_VMDOP_PAUSE_VM_RESPONSE:
545 		case IMSG_VMDOP_UNPAUSE_VM_RESPONSE:
546 			for (i = 0; i < sizeof(procs); i++) {
547 				if (procs[i].p_id == PROC_PARENT) {
548 					proc_forward_imsg(procs[i].p_ps,
549 					    &imsg, PROC_PARENT, -1);
550 					break;
551 				}
552 			}
553 			break;
554 
555 		default:
556 			fatalx("%s: got invalid imsg %d from %s",
557 			    __func__, imsg.hdr.type,
558 			    vm->vm_params.vmc_params.vcp_name);
559 		}
560 		imsg_free(&imsg);
561 	}
562 	imsg_event_add(iev);
563 }
564 
565 /*
566  * terminate_vm
567  *
568  * Requests vmm(4) to terminate the VM whose ID is provided in the
569  * supplied vm_terminate_params structure (vtp->vtp_vm_id)
570  *
571  * Parameters
572  *  vtp: vm_terminate_params struct containing the ID of the VM to terminate
573  *
574  * Return values:
575  *  0: success
576  *  !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM is not
577  *      valid)
578  */
579 int
580 terminate_vm(struct vm_terminate_params *vtp)
581 {
582 	if (ioctl(env->vmd_fd, VMM_IOC_TERM, vtp) == -1)
583 		return (errno);
584 
585 	return (0);
586 }
587 
588 /*
589  * opentap
590  *
591  * Opens the next available tap device, up to MAX_TAP.
592  *
593  * Parameters
594  *  ifname: a buffer of at least IF_NAMESIZE bytes.
595  *
596  * Returns a file descriptor to the tap node opened, or -1 if no tap
597  * devices were available.
598  */
599 int
600 opentap(char *ifname)
601 {
602 	int i, fd;
603 	char path[PATH_MAX];
604 
605 	for (i = 0; i < MAX_TAP; i++) {
606 		snprintf(path, PATH_MAX, "/dev/tap%d", i);
607 		fd = open(path, O_RDWR | O_NONBLOCK);
608 		if (fd != -1) {
609 			snprintf(ifname, IF_NAMESIZE, "tap%d", i);
610 			return (fd);
611 		}
612 	}
613 	strlcpy(ifname, "tap", IF_NAMESIZE);
614 
615 	return (-1);
616 }
617 
618 /*
619  * vmm_start_vm
620  *
621  * Prepares and forks a new VM process.
622  *
623  * Parameters:
624  *  imsg: The VM data structure that is including the VM create parameters.
625  *  id: Returns the VM id as reported by the kernel and obtained from the VM.
626  *  pid: Returns the VM pid to the parent.
627  *
628  * Return values:
629  *  0: success
630  *  !0 : failure - typically an errno indicating the source of the failure
631  */
632 int
633 vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *pid)
634 {
635 	struct vm_create_params	*vcp;
636 	struct vmd_vm		*vm;
637 	int			 ret = EINVAL;
638 	int			 fds[2];
639 	size_t			 i, j;
640 
641 	if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) {
642 		log_warnx("%s: can't find vm", __func__);
643 		ret = ENOENT;
644 		goto err;
645 	}
646 	vcp = &vm->vm_params.vmc_params;
647 
648 	if (!(vm->vm_state & VM_STATE_RECEIVED)) {
649 		if ((vm->vm_tty = imsg->fd) == -1) {
650 			log_warnx("%s: can't get tty", __func__);
651 			goto err;
652 		}
653 	}
654 
655 	if (socketpair(AF_UNIX, SOCK_STREAM, PF_UNSPEC, fds) == -1)
656 		fatal("socketpair");
657 
658 	/* Start child vmd for this VM (fork, chroot, drop privs) */
659 	ret = fork();
660 
661 	/* Start child failed? - cleanup and leave */
662 	if (ret == -1) {
663 		log_warnx("%s: start child failed", __func__);
664 		ret = EIO;
665 		goto err;
666 	}
667 
668 	if (ret > 0) {
669 		/* Parent */
670 		vm->vm_pid = ret;
671 		close(fds[1]);
672 
673 		for (i = 0 ; i < vcp->vcp_ndisks; i++) {
674 			for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
675 				if (vm->vm_disks[i][j] != -1)
676 					close(vm->vm_disks[i][j]);
677 				vm->vm_disks[i][j] = -1;
678 			}
679 		}
680 		for (i = 0 ; i < vcp->vcp_nnics; i++) {
681 			close(vm->vm_ifs[i].vif_fd);
682 			vm->vm_ifs[i].vif_fd = -1;
683 		}
684 		if (vm->vm_kernel != -1) {
685 			close(vm->vm_kernel);
686 			vm->vm_kernel = -1;
687 		}
688 		if (vm->vm_cdrom != -1) {
689 			close(vm->vm_cdrom);
690 			vm->vm_cdrom = -1;
691 		}
692 		if (vm->vm_tty != -1) {
693 			close(vm->vm_tty);
694 			vm->vm_tty = -1;
695 		}
696 
697 		/* read back the kernel-generated vm id from the child */
698 		if (read(fds[0], &vcp->vcp_id, sizeof(vcp->vcp_id)) !=
699 		    sizeof(vcp->vcp_id))
700 			fatal("read vcp id");
701 
702 		if (vcp->vcp_id == 0)
703 			goto err;
704 
705 		*id = vcp->vcp_id;
706 		*pid = vm->vm_pid;
707 
708 		if (vmm_pipe(vm, fds[0], vmm_dispatch_vm) == -1)
709 			fatal("setup vm pipe");
710 
711 		return (0);
712 	} else {
713 		/* Child */
714 		close(fds[0]);
715 		close(PROC_PARENT_SOCK_FILENO);
716 
717 		ret = start_vm(vm, fds[1]);
718 
719 		_exit(ret);
720 	}
721 
722 	return (0);
723 
724  err:
725 	vm_remove(vm, __func__);
726 
727 	return (ret);
728 }
729 
730 /*
731  * get_info_vm
732  *
733  * Returns a list of VMs known to vmm(4).
734  *
735  * Parameters:
736  *  ps: the privsep context.
737  *  imsg: the received imsg including the peer id.
738  *  terminate: terminate the listed vm.
739  *
740  * Return values:
741  *  0: success
742  *  !0 : failure (eg, ENOMEM, EIO or another error code from vmm(4) ioctl)
743  */
744 int
745 get_info_vm(struct privsep *ps, struct imsg *imsg, int terminate)
746 {
747 	int ret;
748 	size_t ct, i;
749 	struct vm_info_params vip;
750 	struct vm_info_result *info;
751 	struct vm_terminate_params vtp;
752 	struct vmop_info_result vir;
753 
754 	/*
755 	 * We issue the VMM_IOC_INFO ioctl twice, once with an input
756 	 * buffer size of 0, which results in vmm(4) returning the
757 	 * number of bytes required back to us in vip.vip_size,
758 	 * and then we call it again after malloc'ing the required
759 	 * number of bytes.
760 	 *
761 	 * It is possible that we could fail a second time (eg, if
762 	 * another VM was created in the instant between the two
763 	 * ioctls, but in that case the caller can just try again
764 	 * as vmm(4) will return a zero-sized list in that case.
765 	 */
766 	vip.vip_size = 0;
767 	info = NULL;
768 	ret = 0;
769 	memset(&vir, 0, sizeof(vir));
770 
771 	/* First ioctl to see how many bytes needed (vip.vip_size) */
772 	if (ioctl(env->vmd_fd, VMM_IOC_INFO, &vip) == -1)
773 		return (errno);
774 
775 	if (vip.vip_info_ct != 0)
776 		return (EIO);
777 
778 	info = malloc(vip.vip_size);
779 	if (info == NULL)
780 		return (ENOMEM);
781 
782 	/* Second ioctl to get the actual list */
783 	vip.vip_info = info;
784 	if (ioctl(env->vmd_fd, VMM_IOC_INFO, &vip) == -1) {
785 		ret = errno;
786 		free(info);
787 		return (ret);
788 	}
789 
790 	/* Return info */
791 	ct = vip.vip_size / sizeof(struct vm_info_result);
792 	for (i = 0; i < ct; i++) {
793 		if (terminate) {
794 			vtp.vtp_vm_id = info[i].vir_id;
795 			if ((ret = terminate_vm(&vtp)) != 0)
796 				return (ret);
797 			log_debug("%s: terminated vm %s (id %d)", __func__,
798 			    info[i].vir_name, info[i].vir_id);
799 			continue;
800 		}
801 		memcpy(&vir.vir_info, &info[i], sizeof(vir.vir_info));
802 		vir.vir_info.vir_id = vm_id2vmid(info[i].vir_id, NULL);
803 		if (proc_compose_imsg(ps, PROC_PARENT, -1,
804 		    IMSG_VMDOP_GET_INFO_VM_DATA, imsg->hdr.peerid, -1,
805 		    &vir, sizeof(vir)) == -1)
806 			return (EIO);
807 	}
808 	free(info);
809 	return (0);
810 }
811