xref: /openbsd-src/usr.sbin/vmd/vmm.c (revision 0a9d031fce78c0ebce0995b311938b1c87b1e208)
1 /*	$OpenBSD: vmm.c,v 1.130 2024/11/21 13:39:34 claudio Exp $	*/
2 
3 /*
4  * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/types.h>
20 #include <sys/ioctl.h>
21 #include <sys/queue.h>
22 #include <sys/wait.h>
23 #include <sys/socket.h>
24 
25 #include <dev/vmm/vmm.h>
26 
27 #include <net/if.h>
28 
29 #include <errno.h>
30 #include <event.h>
31 #include <fcntl.h>
32 #include <imsg.h>
33 #include <limits.h>
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include <string.h>
37 #include <unistd.h>
38 
39 #include "vmd.h"
40 #include "atomicio.h"
41 #include "proc.h"
42 
43 void	vmm_sighdlr(int, short, void *);
44 int	vmm_start_vm(struct imsg *, uint32_t *, pid_t *);
45 int	vmm_dispatch_parent(int, struct privsep_proc *, struct imsg *);
46 void	vmm_run(struct privsep *, struct privsep_proc *, void *);
47 void	vmm_dispatch_vm(int, short, void *);
48 int	terminate_vm(struct vm_terminate_params *);
49 int	get_info_vm(struct privsep *, struct imsg *, int);
50 int	opentap(char *);
51 
52 extern struct vmd *env;
53 
/* Peer processes we talk to; messages from the parent are handled below. */
static struct privsep_proc procs[] = {
	{ "parent",	PROC_PARENT,	vmm_dispatch_parent  },
};
57 
/*
 * vmm
 *
 * Entry point of the vmm process: hands control to the privsep run
 * loop, registering vmm_run as the per-process init callback.
 */
void
vmm(struct privsep *ps, struct privsep_proc *p)
{
	proc_run(ps, p, procs, nitems(procs), vmm_run, NULL);
}
63 
/*
 * vmm_run
 *
 * Init callback invoked by proc_run(): loads the initial configuration,
 * restricts filesystem visibility with unveil(2) and syscall access with
 * pledge(2), and installs our own SIGCHLD handler so exiting VM child
 * processes are reaped by vmm_sighdlr.
 */
void
vmm_run(struct privsep *ps, struct privsep_proc *p, void *arg)
{
	if (config_init(ps->ps_env) == -1)
		fatal("failed to initialize configuration");

	/*
	 * We aren't root, so we can't chroot(2). Use unveil(2) instead.
	 */
	if (unveil(env->argv0, "x") == -1)
		fatal("unveil %s", env->argv0);
	if (unveil(NULL, NULL) == -1)
		fatal("unveil lock");

	/*
	 * pledge in the vmm process:
	 * stdio - for malloc and basic I/O including events.
	 * vmm - for the vmm ioctls and operations.
	 * proc, exec - for forking and execing new vm's.
	 * sendfd - for sending send/recv fds to vm proc.
	 * recvfd - for disks, interfaces and other fds.
	 */
	if (pledge("stdio vmm sendfd recvfd proc exec", NULL) == -1)
		fatal("pledge");

	/* Replace the generic privsep SIGCHLD handler with our own. */
	signal_del(&ps->ps_evsigchld);
	signal_set(&ps->ps_evsigchld, SIGCHLD, vmm_sighdlr, ps);
	signal_add(&ps->ps_evsigchld, NULL);
}
93 
/*
 * vmm_dispatch_parent
 *
 * imsg callback for messages received from the parent (vmd) process.
 * Handles VM lifecycle requests (start, terminate, pause, send/receive)
 * and configuration updates, then composes a response imsg back to the
 * parent where the protocol requires one (cmd != 0).
 *
 * Return values:
 *  0: message handled
 *  -1: unknown imsg type, or composing the response to the parent failed
 */
int
vmm_dispatch_parent(int fd, struct privsep_proc *p, struct imsg *imsg)
{
	struct privsep		*ps = p->p_ps;
	int			 res = 0, cmd = 0, verbose;
	struct vmd_vm		*vm = NULL;
	struct vm_terminate_params vtp;
	struct vmop_id		 vid;
	struct vmop_result	 vmr;
	struct vmop_create_params vmc;
	struct vmop_addr_result  var;
	uint32_t		 id = 0, peerid = imsg->hdr.peerid;
	pid_t			 pid = 0;
	unsigned int		 mode, flags;

	switch (imsg->hdr.type) {
	case IMSG_VMDOP_START_VM_REQUEST:
		/* Register the VM; on failure report errno to the parent. */
		res = config_getvm(ps, imsg);
		if (res == -1) {
			res = errno;
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
		}
		break;
	case IMSG_VMDOP_START_VM_CDROM:
		res = config_getcdrom(ps, imsg);
		if (res == -1) {
			res = errno;
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
		}
		break;
	case IMSG_VMDOP_START_VM_DISK:
		res = config_getdisk(ps, imsg);
		if (res == -1) {
			res = errno;
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
		}
		break;
	case IMSG_VMDOP_START_VM_IF:
		res = config_getif(ps, imsg);
		if (res == -1) {
			res = errno;
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
		}
		break;
	case IMSG_VMDOP_START_VM_END:
		/* All pieces received; fork+exec the VM process. */
		res = vmm_start_vm(imsg, &id, &pid);
		/* Check if the ID can be mapped correctly */
		if (res == 0 && (id = vm_id2vmid(id, NULL)) == 0)
			res = ENOENT;
		cmd = IMSG_VMDOP_START_VM_RESPONSE;
		break;
	case IMSG_VMDOP_TERMINATE_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		id = vid.vid_id;
		flags = vid.vid_flags;

		DPRINTF("%s: recv'ed TERMINATE_VM for %d", __func__, id);

		cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;

		if (id == 0) {
			res = ENOENT;
		} else if ((vm = vm_getbyvmid(id)) != NULL) {
			if (flags & VMOP_FORCE) {
				/* Forced stop: terminate via vmm(4) now. */
				vtp.vtp_vm_id = vm_vmid2id(vm->vm_vmid, vm);
				vm->vm_state |= VM_STATE_SHUTDOWN;
				(void)terminate_vm(&vtp);
				res = 0;
			} else if (!(vm->vm_state & VM_STATE_SHUTDOWN)) {
				log_debug("%s: sending shutdown request"
				    " to vm %d", __func__, id);

				/*
				 * Request reboot but mark the VM as shutting
				 * down. This way we can terminate the VM after
				 * the triple fault instead of reboot and
				 * avoid being stuck in the ACPI-less powerdown
				 * ("press any key to reboot") of the VM.
				 */
				vm->vm_state |= VM_STATE_SHUTDOWN;
				if (imsg_compose_event(&vm->vm_iev,
				    IMSG_VMDOP_VM_REBOOT,
				    0, 0, -1, NULL, 0) == -1)
					res = errno;
				else
					res = 0;
			} else {
				/*
				 * VM is currently being shutdown.
				 * Check to see if the VM process is still
				 * active.  If not, return VMD_VM_STOP_INVALID.
				 */
				if (vm_vmid2id(vm->vm_vmid, vm) == 0) {
					log_debug("%s: no vm running anymore",
					    __func__);
					res = VMD_VM_STOP_INVALID;
				}
			}
		} else {
			/* VM doesn't exist, cannot stop vm */
			log_debug("%s: cannot stop vm that is not running",
			    __func__);
			res = VMD_VM_STOP_INVALID;
		}
		break;
	case IMSG_VMDOP_GET_INFO_VM_REQUEST:
		res = get_info_vm(ps, imsg, 0);
		cmd = IMSG_VMDOP_GET_INFO_VM_END_DATA;
		break;
	case IMSG_VMDOP_CONFIG:
		config_getconfig(env, imsg);
		break;
	case IMSG_CTL_RESET:
		IMSG_SIZE_CHECK(imsg, &mode);
		memcpy(&mode, imsg->data, sizeof(mode));

		if (mode & CONFIG_VMS) {
			/* Terminate and remove all VMs */
			vmm_shutdown();
			mode &= ~CONFIG_VMS;
		}

		config_getreset(env, imsg);
		break;
	case IMSG_CTL_VERBOSE:
		IMSG_SIZE_CHECK(imsg, &verbose);
		memcpy(&verbose, imsg->data, sizeof(verbose));
		log_setverbose(verbose);
		env->vmd_verbose = verbose;
		/* Forward message to each VM process */
		TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
			imsg_compose_event(&vm->vm_iev,
			    imsg->hdr.type, imsg->hdr.peerid, imsg->hdr.pid,
			    -1, &verbose, sizeof(verbose));
		}
		break;
	case IMSG_VMDOP_PAUSE_VM:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		id = vid.vid_id;
		if ((vm = vm_getbyvmid(id)) == NULL) {
			res = ENOENT;
			cmd = IMSG_VMDOP_PAUSE_VM_RESPONSE;
			break;
		}
		/* Relay the request (and any fd) to the VM process. */
		imsg_compose_event(&vm->vm_iev,
		    imsg->hdr.type, imsg->hdr.peerid, imsg->hdr.pid,
		    imsg_get_fd(imsg), &vid, sizeof(vid));
		break;
	case IMSG_VMDOP_UNPAUSE_VM:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		id = vid.vid_id;
		if ((vm = vm_getbyvmid(id)) == NULL) {
			res = ENOENT;
			cmd = IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
			break;
		}
		imsg_compose_event(&vm->vm_iev,
		    imsg->hdr.type, imsg->hdr.peerid, imsg->hdr.pid,
		    imsg_get_fd(imsg), &vid, sizeof(vid));
		break;
	case IMSG_VMDOP_SEND_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		id = vid.vid_id;
		if ((vm = vm_getbyvmid(id)) == NULL) {
			res = ENOENT;
			close(imsg_get_fd(imsg));	/* XXX */
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			break;
		}
		imsg_compose_event(&vm->vm_iev,
		    imsg->hdr.type, imsg->hdr.peerid, imsg->hdr.pid,
		    imsg_get_fd(imsg), &vid, sizeof(vid));
		break;
	case IMSG_VMDOP_RECEIVE_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vmc);
		memcpy(&vmc, imsg->data, sizeof(vmc));
		if (vm_register(ps, &vmc, &vm,
		    imsg->hdr.peerid, vmc.vmc_owner.uid) != 0) {
			res = errno;
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			break;
		}
		/* The received VM starts paused until RECEIVE_VM_END. */
		vm->vm_tty = imsg_get_fd(imsg);
		vm->vm_state |= VM_STATE_RECEIVED;
		vm->vm_state |= VM_STATE_PAUSED;
		break;
	case IMSG_VMDOP_RECEIVE_VM_END:
		if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) {
			res = ENOENT;
			close(imsg_get_fd(imsg));	/* XXX */
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			break;
		}
		vm->vm_receive_fd = imsg_get_fd(imsg);
		res = vmm_start_vm(imsg, &id, &pid);
		/* Check if the ID can be mapped correctly */
		if ((id = vm_id2vmid(id, NULL)) == 0)
			res = ENOENT;
		cmd = IMSG_VMDOP_START_VM_RESPONSE;
		break;
	case IMSG_VMDOP_PRIV_GET_ADDR_RESPONSE:
		IMSG_SIZE_CHECK(imsg, &var);
		memcpy(&var, imsg->data, sizeof(var));
		if ((vm = vm_getbyvmid(var.var_vmid)) == NULL) {
			res = ENOENT;
			break;
		}
		/* Forward hardware address details to the guest vm */
		imsg_compose_event(&vm->vm_iev,
		    imsg->hdr.type, imsg->hdr.peerid, imsg->hdr.pid,
		    imsg_get_fd(imsg), &var, sizeof(var));
		break;
	case IMSG_VMDOP_RECEIVE_VMM_FD:
		if (env->vmd_fd > -1)
			fatalx("already received vmm fd");
		env->vmd_fd = imsg_get_fd(imsg);

		/* Get and terminate all running VMs */
		get_info_vm(ps, NULL, 1);
		break;
	case IMSG_VMDOP_RECEIVE_PSP_FD:
		if (env->vmd_psp_fd > -1)
			fatalx("already received psp fd");
		env->vmd_psp_fd = imsg_get_fd(imsg);
		break;
	default:
		return (-1);
	}

	/* Compose the response to the parent, if the handler set cmd. */
	switch (cmd) {
	case 0:
		break;
	case IMSG_VMDOP_START_VM_RESPONSE:
		if (res != 0) {
			/* Remove local reference if it exists */
			if ((vm = vm_getbyvmid(imsg->hdr.peerid)) != NULL) {
				log_debug("%s: removing vm, START_VM_RESPONSE",
				    __func__);
				vm_remove(vm, __func__);
			}
		}
		if (id == 0)
			id = imsg->hdr.peerid;
		/* FALLTHROUGH */
	case IMSG_VMDOP_PAUSE_VM_RESPONSE:
	case IMSG_VMDOP_UNPAUSE_VM_RESPONSE:
	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
		memset(&vmr, 0, sizeof(vmr));
		vmr.vmr_result = res;
		vmr.vmr_id = id;
		vmr.vmr_pid = pid;
		if (proc_compose_imsg(ps, PROC_PARENT, -1, cmd,
		    peerid, -1, &vmr, sizeof(vmr)) == -1)
			return (-1);
		break;
	default:
		if (proc_compose_imsg(ps, PROC_PARENT, -1, cmd,
		    peerid, -1, &res, sizeof(res)) == -1)
			return (-1);
		break;
	}

	return (0);
}
362 
/*
 * vmm_sighdlr
 *
 * libevent signal callback for the vmm process.  On SIGCHLD, reaps every
 * exited VM child process, asks vmm(4) to tear down the corresponding
 * kernel VM, reports the termination to the parent process and removes
 * the local VM state.
 */
void
vmm_sighdlr(int sig, short event, void *arg)
{
	struct privsep *ps = arg;
	int status, ret = 0;
	uint32_t vmid;
	pid_t pid;
	struct vmop_result vmr;
	struct vmd_vm *vm;
	struct vm_terminate_params vtp;

	log_debug("%s: handling signal %d", __func__, sig);
	switch (sig) {
	case SIGCHLD:
		/* Reap all exited children; SIGCHLD may coalesce. */
		do {
			pid = waitpid(-1, &status, WNOHANG);
			if (pid <= 0)
				continue;

			if (WIFEXITED(status) || WIFSIGNALED(status)) {
				vm = vm_getbypid(pid);
				if (vm == NULL) {
					/*
					 * If the VM is gone already, it
					 * got terminated via a
					 * IMSG_VMDOP_TERMINATE_VM_REQUEST.
					 */
					continue;
				}

				if (WIFEXITED(status))
					ret = WEXITSTATUS(status);

				/* Don't reboot on pending shutdown */
				if (ret == EAGAIN &&
				    (vm->vm_state & VM_STATE_SHUTDOWN))
					ret = 0;

				vmid = vm->vm_params.vmc_params.vcp_id;
				vtp.vtp_vm_id = vmid;

				if (terminate_vm(&vtp) == 0)
					log_debug("%s: terminated vm %s"
					    " (id %d)", __func__,
					    vm->vm_params.vmc_params.vcp_name,
					    vm->vm_vmid);

				/* Tell the parent the VM is gone. */
				memset(&vmr, 0, sizeof(vmr));
				vmr.vmr_result = ret;
				vmr.vmr_id = vm_id2vmid(vmid, vm);
				if (proc_compose_imsg(ps, PROC_PARENT,
				    -1, IMSG_VMDOP_TERMINATE_VM_EVENT,
				    vm->vm_peerid, -1,
				    &vmr, sizeof(vmr)) == -1)
					log_warnx("could not signal "
					    "termination of VM %u to "
					    "parent", vm->vm_vmid);

				vm_remove(vm, __func__);
			} else
				fatalx("unexpected cause of SIGCHLD");
		} while (pid > 0 || (pid == -1 && errno == EINTR));
		break;
	default:
		fatalx("unexpected signal");
	}
}
430 
431 /*
432  * vmm_shutdown
433  *
434  * Terminate VMs on shutdown to avoid "zombie VM" processes.
435  */
436 void
437 vmm_shutdown(void)
438 {
439 	struct vm_terminate_params vtp;
440 	struct vmd_vm *vm, *vm_next;
441 
442 	TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry, vm_next) {
443 		vtp.vtp_vm_id = vm_vmid2id(vm->vm_vmid, vm);
444 
445 		/* XXX suspend or request graceful shutdown */
446 		(void)terminate_vm(&vtp);
447 		vm_remove(vm, __func__);
448 	}
449 }
450 
451 /*
452  * vmm_pipe
453  *
454  * Create a new imsg control channel between vmm parent and a VM
455  * (can be called on both sides).
456  */
457 int
458 vmm_pipe(struct vmd_vm *vm, int fd, void (*cb)(int, short, void *))
459 {
460 	struct imsgev	*iev = &vm->vm_iev;
461 
462 	/*
463 	 * Set to close-on-exec as vmm_pipe is used after fork+exec to
464 	 * establish async ipc between vm and vmd's vmm process. This
465 	 * prevents future vm processes or virtio subprocesses from
466 	 * inheriting this control channel.
467 	 */
468 	if (fcntl(fd, F_SETFD, FD_CLOEXEC) == -1) {
469 		log_warn("failed to set close-on-exec for vmm ipc channel");
470 		return (-1);
471 	}
472 
473 	if (imsgbuf_init(&iev->ibuf, fd) == -1) {
474 		log_warn("failed to init imsgbuf");
475 		return (-1);
476 	}
477 	imsgbuf_allow_fdpass(&iev->ibuf);
478 	iev->handler = cb;
479 	iev->data = vm;
480 	imsg_event_add(iev);
481 
482 	return (0);
483 }
484 
485 /*
486  * vmm_dispatch_vm
487  *
488  * imsg callback for messages that are received from a VM child process.
489  */
void
vmm_dispatch_vm(int fd, short event, void *arg)
{
	struct vmd_vm		*vm = arg;
	struct vmop_result	 vmr;
	struct imsgev		*iev = &vm->vm_iev;
	struct imsgbuf		*ibuf = &iev->ibuf;
	struct imsg		 imsg;
	ssize_t			 n;
	unsigned int		 i;

	if (event & EV_READ) {
		if ((n = imsgbuf_read(ibuf)) == -1)
			fatal("%s: imsgbuf_read", __func__);
		if (n == 0) {
			/* This pipe is dead, so remove the event handler */
			event_del(&iev->ev);
			return;
		}
	}

	if (event & EV_WRITE) {
		if (imsgbuf_write(ibuf) == -1) {
			if (errno == EPIPE) {
				/* This pipe is dead, remove the handler */
				event_del(&iev->ev);
				return;
			}
			fatal("%s: imsgbuf_write fd %d", __func__, ibuf->fd);
		}
	}

	/* Drain and process every queued imsg from the VM process. */
	for (;;) {
		if ((n = imsg_get(ibuf, &imsg)) == -1)
			fatal("%s: imsg_get", __func__);
		if (n == 0)
			break;

		DPRINTF("%s: got imsg %d from %s",
		    __func__, imsg.hdr.type,
		    vm->vm_params.vmc_params.vcp_name);

		switch (imsg.hdr.type) {
		case IMSG_VMDOP_VM_SHUTDOWN:
			/* The guest began an orderly shutdown. */
			vm->vm_state |= VM_STATE_SHUTDOWN;
			break;
		case IMSG_VMDOP_VM_REBOOT:
			/* The guest is rebooting, not shutting down. */
			vm->vm_state &= ~VM_STATE_SHUTDOWN;
			break;
		case IMSG_VMDOP_SEND_VM_RESPONSE:
			IMSG_SIZE_CHECK(&imsg, &vmr);
			/* FALLTHROUGH */
		case IMSG_VMDOP_PAUSE_VM_RESPONSE:
		case IMSG_VMDOP_UNPAUSE_VM_RESPONSE:
			/* Relay the response up to the parent process. */
			for (i = 0; i < nitems(procs); i++) {
				if (procs[i].p_id == PROC_PARENT) {
					proc_forward_imsg(procs[i].p_ps,
					    &imsg, PROC_PARENT, -1);
					break;
				}
			}
			break;

		default:
			fatalx("%s: got invalid imsg %d from %s",
			    __func__, imsg.hdr.type,
			    vm->vm_params.vmc_params.vcp_name);
		}
		imsg_free(&imsg);
	}
	imsg_event_add(iev);
}
561 
562 /*
563  * terminate_vm
564  *
565  * Requests vmm(4) to terminate the VM whose ID is provided in the
566  * supplied vm_terminate_params structure (vtp->vtp_vm_id)
567  *
568  * Parameters
569  *  vtp: vm_terminate_params struct containing the ID of the VM to terminate
570  *
571  * Return values:
572  *  0: success
573  *  !0: ioctl to vmm(4) failed (eg, ENOENT if the supplied VM is not valid)
574  */
575 int
576 terminate_vm(struct vm_terminate_params *vtp)
577 {
578 	if (ioctl(env->vmd_fd, VMM_IOC_TERM, vtp) == -1)
579 		return (errno);
580 
581 	return (0);
582 }
583 
584 /*
585  * opentap
586  *
587  * Opens the next available tap device, up to MAX_TAP.
588  *
589  * Parameters
590  *  ifname: a buffer of at least IF_NAMESIZE bytes.
591  *
592  * Returns a file descriptor to the tap node opened or -1 if no tap devices were
593  * available, setting errno to the open(2) error.
594  */
595 int
596 opentap(char *ifname)
597 {
598 	int err = 0, i, fd;
599 	char path[PATH_MAX];
600 
601 	for (i = 0; i < MAX_TAP; i++) {
602 		snprintf(path, PATH_MAX, "/dev/tap%d", i);
603 
604 		errno = 0;
605 		fd = open(path, O_RDWR | O_NONBLOCK);
606 		if (fd != -1)
607 			break;
608 		err = errno;
609 		if (err == EBUSY) {
610 			/* Busy...try next tap. */
611 			continue;
612 		} else if (err == ENOENT) {
613 			/* Ran out of /dev/tap* special files. */
614 			break;
615 		} else {
616 			log_warn("%s: unexpected error", __func__);
617 			break;
618 		}
619 	}
620 
621 	/* Record the last opened tap device. */
622 	snprintf(ifname, IF_NAMESIZE, "tap%d", i);
623 
624 	if (err)
625 		errno = err;
626 	return (fd);
627 }
628 
629 /*
630  * vmm_start_vm
631  *
632  * Prepares and fork+execs a new VM process.
633  *
634  * Parameters:
635  *  imsg: The VM data structure that is including the VM create parameters.
636  *  id: Returns the VM id as reported by the kernel and obtained from the VM.
637  *  pid: Returns the VM pid to the parent.
638  *
639  * Return values:
640  *  0: success
641  *  !0: failure - typically an errno indicating the source of the failure
642  */
int
vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *pid)
{
	struct vm_create_params	*vcp;
	struct vmd_vm		*vm;
	char			*nargv[10], num[32], vmm_fd[32], psp_fd[32];
	int			 fd, ret = EINVAL;
	int			 fds[2];
	pid_t			 vm_pid;
	size_t			 i, j, sz;

	if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) {
		log_warnx("%s: can't find vm", __func__);
		return (ENOENT);
	}
	vcp = &vm->vm_params.vmc_params;

	/* A received (migrated) VM already has its tty set up. */
	if (!(vm->vm_state & VM_STATE_RECEIVED)) {
		if ((vm->vm_tty = imsg_get_fd(imsg)) == -1) {
			log_warnx("%s: can't get tty", __func__);
			goto err;
		}
	}

	/* Control channel between this process and the new VM process. */
	if (socketpair(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, PF_UNSPEC, fds)
	    == -1)
		fatal("socketpair");

	/* Start child vmd for this VM (fork, chroot, drop privs) */
	vm_pid = fork();
	if (vm_pid == -1) {
		log_warn("%s: start child failed", __func__);
		ret = EIO;
		goto err;
	}

	if (vm_pid > 0) {
		/* Parent */
		vm->vm_pid = vm_pid;
		close_fd(fds[1]);

		/* Send the details over the pipe to the child. */
		sz = atomicio(vwrite, fds[0], vm, sizeof(*vm));
		if (sz != sizeof(*vm)) {
			log_warnx("%s: failed to send config for vm '%s'",
			    __func__, vcp->vcp_name);
			ret = EIO;
			/* Defer error handling until after fd closing. */
		}

		/* As the parent/vmm process, we no longer need these fds. */
		for (i = 0 ; i < vm->vm_params.vmc_ndisks; i++) {
			for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
				if (close_fd(vm->vm_disks[i][j]) == 0)
				    vm->vm_disks[i][j] = -1;
			}
		}
		for (i = 0 ; i < vm->vm_params.vmc_nnics; i++) {
			if (close_fd(vm->vm_ifs[i].vif_fd) == 0)
			    vm->vm_ifs[i].vif_fd = -1;
		}
		if (close_fd(vm->vm_kernel) == 0)
			vm->vm_kernel = -1;
		if (close_fd(vm->vm_cdrom) == 0)
			vm->vm_cdrom = -1;
		if (close_fd(vm->vm_tty) == 0)
			vm->vm_tty = -1;

		/* Deferred error handling from sending the vm struct. */
		if (ret == EIO)
			goto err;

		/* Send the current local prefix configuration. */
		sz = atomicio(vwrite, fds[0], &env->vmd_cfg.cfg_localprefix,
		    sizeof(env->vmd_cfg.cfg_localprefix));
		if (sz != sizeof(env->vmd_cfg.cfg_localprefix)) {
			log_warnx("%s: failed to send local prefix for vm '%s'",
			    __func__, vcp->vcp_name);
			ret = EIO;
			goto err;
		}

		/* Read back the kernel-generated vm id from the child */
		sz = atomicio(read, fds[0], &vcp->vcp_id, sizeof(vcp->vcp_id));
		if (sz != sizeof(vcp->vcp_id)) {
			log_debug("%s: failed to receive vm id from vm %s",
			    __func__, vcp->vcp_name);
			/* vmd could not allocate memory for the vm. */
			ret = ENOMEM;
			goto err;
		}

		/* Check for an invalid id. This indicates child failure. */
		if (vcp->vcp_id == 0)
			goto err;

		*id = vcp->vcp_id;
		*pid = vm->vm_pid;

		/* Wire up our pipe into the event handling. */
		if (vmm_pipe(vm, fds[0], vmm_dispatch_vm) == -1)
			fatal("setup vm pipe");
	} else {
		/* Child. Create a new session. */
		if (setsid() == -1)
			fatal("setsid");

		close_fd(fds[0]);
		close_fd(PROC_PARENT_SOCK_FILENO);

		/* Detach from terminal. */
		if (!env->vmd_debug && (fd =
			open("/dev/null", O_RDWR, 0)) != -1) {
			dup2(fd, STDIN_FILENO);
			dup2(fd, STDOUT_FILENO);
			dup2(fd, STDERR_FILENO);
			if (fd > 2)
				close(fd);
		}

		/* Clear close-on-exec so the psp fd survives execvp below. */
		if (env->vmd_psp_fd > 0)
			fcntl(env->vmd_psp_fd, F_SETFD, 0); /* psp device fd */

		/*
		 * Prepare our new argv for execvp(2) with the fd of our open
		 * pipe to the parent/vmm process as an argument.
		 */
		memset(num, 0, sizeof(num));
		snprintf(num, sizeof(num), "%d", fds[1]);
		memset(vmm_fd, 0, sizeof(vmm_fd));
		snprintf(vmm_fd, sizeof(vmm_fd), "%d", env->vmd_fd);
		memset(psp_fd, 0, sizeof(psp_fd));
		snprintf(psp_fd, sizeof(psp_fd), "%d", env->vmd_psp_fd);

		i = 0;
		nargv[i++] = env->argv0;
		nargv[i++] = "-V";
		nargv[i++] = num;
		nargv[i++] = "-i";
		nargv[i++] = vmm_fd;
		nargv[i++] = "-j";
		nargv[i++] = psp_fd;
		if (env->vmd_debug)
			nargv[i++] = "-d";
		if (env->vmd_verbose == 1)
			nargv[i++] = "-v";
		else if (env->vmd_verbose > 1)
			nargv[i++] = "-vv";
		nargv[i++] = NULL;
		if (i > sizeof(nargv) / sizeof(nargv[0]))
			fatalx("%s: nargv overflow", __func__);

		/* Control resumes in vmd main(). */
		execvp(nargv[0], nargv);

		ret = errno;
		log_warn("execvp %s", nargv[0]);
		_exit(ret);
		/* NOTREACHED */
	}

	return (0);

 err:
	if (!vm->vm_from_config)
		vm_remove(vm, __func__);

	return (ret);
}
812 
813 /*
814  * get_info_vm
815  *
816  * Returns a list of VMs known to vmm(4).
817  *
818  * Parameters:
819  *  ps: the privsep context.
820  *  imsg: the received imsg including the peer id.
821  *  terminate: terminate the listed vm.
822  *
823  * Return values:
824  *  0: success
825  *  !0: failure (eg, ENOMEM, EIO or another error code from vmm(4) ioctl)
826  */
int
get_info_vm(struct privsep *ps, struct imsg *imsg, int terminate)
{
	int ret;
	size_t ct, i;
	struct vm_info_params vip;
	struct vm_info_result *info;
	struct vm_terminate_params vtp;
	struct vmop_info_result vir;

	/*
	 * We issue the VMM_IOC_INFO ioctl twice, once with an input
	 * buffer size of 0, which results in vmm(4) returning the
	 * number of bytes required back to us in vip.vip_size,
	 * and then we call it again after malloc'ing the required
	 * number of bytes.
	 *
	 * It is possible that we could fail a second time (e.g. if
	 * another VM was created in the instant between the two
	 * ioctls, but in that case the caller can just try again
	 * as vmm(4) will return a zero-sized list in that case.
	 */
	vip.vip_size = 0;
	info = NULL;
	ret = 0;
	memset(&vir, 0, sizeof(vir));

	/* First ioctl to see how many bytes needed (vip.vip_size) */
	if (ioctl(env->vmd_fd, VMM_IOC_INFO, &vip) == -1)
		return (errno);

	/* With a zero-sized buffer no entries should have been filled in. */
	if (vip.vip_info_ct != 0)
		return (EIO);

	info = malloc(vip.vip_size);
	if (info == NULL)
		return (ENOMEM);

	/* Second ioctl to get the actual list */
	vip.vip_info = info;
	if (ioctl(env->vmd_fd, VMM_IOC_INFO, &vip) == -1) {
		ret = errno;
		free(info);
		return (ret);
	}

	/* Return info */
	ct = vip.vip_size / sizeof(struct vm_info_result);
	for (i = 0; i < ct; i++) {
		if (terminate) {
			/* Terminate mode: imsg is not used on this path. */
			vtp.vtp_vm_id = info[i].vir_id;
			if ((ret = terminate_vm(&vtp)) != 0)
				break;
			log_debug("%s: terminated vm %s (id %d)", __func__,
			    info[i].vir_name, info[i].vir_id);
			continue;
		}
		/* Report mode: send one imsg per VM back to the parent. */
		memcpy(&vir.vir_info, &info[i], sizeof(vir.vir_info));
		vir.vir_info.vir_id = vm_id2vmid(info[i].vir_id, NULL);
		if (proc_compose_imsg(ps, PROC_PARENT, -1,
		    IMSG_VMDOP_GET_INFO_VM_DATA, imsg->hdr.peerid, -1,
		    &vir, sizeof(vir)) == -1) {
			ret = EIO;
			break;
		}
	}
	free(info);

	return (ret);
}
897