xref: /openbsd-src/usr.sbin/vmd/vmd.c (revision 8550894424f8a4aa4aafb6cd57229dd6ed7cd9dd)
1 /*	$OpenBSD: vmd.c,v 1.137 2023/01/22 22:18:40 dv Exp $	*/
2 
3 /*
4  * Copyright (c) 2015 Reyk Floeter <reyk@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/types.h>
20 #include <sys/queue.h>
21 #include <sys/wait.h>
22 #include <sys/stat.h>
23 #include <sys/sysctl.h>
24 #include <sys/tty.h>
25 #include <sys/ttycom.h>
26 #include <sys/ioctl.h>
27 
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <termios.h>
32 #include <errno.h>
33 #include <event.h>
34 #include <fcntl.h>
35 #include <pwd.h>
36 #include <signal.h>
37 #include <syslog.h>
38 #include <unistd.h>
39 #include <util.h>
40 #include <ctype.h>
41 #include <pwd.h>
42 #include <grp.h>
43 
44 #include <machine/specialreg.h>
45 #include <machine/vmmvar.h>
46 
47 #include "proc.h"
48 #include "atomicio.h"
49 #include "vmd.h"
50 
51 __dead void usage(void);
52 
53 int	 main(int, char **);
54 int	 vmd_configure(void);
55 void	 vmd_sighdlr(int sig, short event, void *arg);
56 void	 vmd_shutdown(void);
57 int	 vmd_control_run(void);
58 int	 vmd_dispatch_control(int, struct privsep_proc *, struct imsg *);
59 int	 vmd_dispatch_vmm(int, struct privsep_proc *, struct imsg *);
60 int	 vmd_dispatch_agentx(int, struct privsep_proc *, struct imsg *);
61 int	 vmd_dispatch_priv(int, struct privsep_proc *, struct imsg *);
62 int	 vmd_check_vmh(struct vm_dump_header *);
63 
64 int	 vm_instance(struct privsep *, struct vmd_vm **,
65 	    struct vmop_create_params *, uid_t);
66 int	 vm_checkinsflag(struct vmop_create_params *, unsigned int, uid_t);
67 int	 vm_claimid(const char *, int, uint32_t *);
68 void	 start_vm_batch(int, short, void*);
69 
70 static inline void vm_terminate(struct vmd_vm *, const char *);
71 
/* Global vmd state, allocated in main() and shared by this whole file. */
struct vmd	*env;

static struct privsep_proc procs[] = {
	/* Keep "priv" on top as procs[0] */
	{ "priv",	PROC_PRIV,	vmd_dispatch_priv, priv },
	{ "control",	PROC_CONTROL,	vmd_dispatch_control, control },
	{ "vmm",	PROC_VMM,	vmd_dispatch_vmm, vmm, vmm_shutdown },
	{ "agentx", 	PROC_AGENTX,	vmd_dispatch_agentx, vm_agentx, vm_agentx_shutdown, "/" }
};

/* Identity of the currently running privsep process (parent/child). */
enum privsep_procid privsep_process;

/* Timer driving the staggered VM autostart batches, see start_vm_batch(). */
struct event staggered_start_timer;

/* For the privileged process */
static struct privsep_proc *proc_priv = &procs[0];
static struct passwd proc_privpw;	/* all-zero passwd for the priv proc */
static const uint8_t zero_mac[ETHER_ADDR_LEN];
90 
/*
 * Dispatch imsg requests received from the control process (vmctl
 * clients).  Requests are either handled in the parent directly or
 * relayed to the vmm process; "cmd" selects the response imsg type
 * sent back to control at the end (0 means no immediate response).
 *
 * Returns 0 on success or -1 if the imsg could not be handled and the
 * channel should be torn down.
 */
int
vmd_dispatch_control(int fd, struct privsep_proc *p, struct imsg *imsg)
{
	struct privsep			*ps = p->p_ps;
	int				 res = 0, ret = 0, cmd = 0, verbose;
	unsigned int			 v = 0, flags;
	struct vmop_create_params	 vmc;
	struct vmop_id			 vid;
	struct vmop_result		 vmr;
	struct vm_dump_header		 vmh;
	struct vmd_vm			*vm = NULL;
	char				*str = NULL;
	uint32_t			 id = 0;
	struct control_sock		*rcs;

	switch (imsg->hdr.type) {
	case IMSG_VMDOP_START_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vmc);
		memcpy(&vmc, imsg->data, sizeof(vmc));
		ret = vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid);
		if (vmc.vmc_flags == 0) {
			/* start an existing VM with pre-configured options */
			if (!(ret == -1 && errno == EALREADY &&
			    !(vm->vm_state & VM_STATE_RUNNING))) {
				res = errno;
				cmd = IMSG_VMDOP_START_VM_RESPONSE;
			}
		} else if (ret != 0) {
			res = errno;
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
		}
		if (res == 0) {
			/* registration ok so far: actually start the vm */
			res = config_setvm(ps, vm, imsg->hdr.peerid,
			    vm->vm_params.vmc_owner.uid);
			if (res)
				cmd = IMSG_VMDOP_START_VM_RESPONSE;
		}
		break;
	case IMSG_VMDOP_WAIT_VM_REQUEST:
	case IMSG_VMDOP_TERMINATE_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		flags = vid.vid_flags;
		cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;

		if ((id = vid.vid_id) == 0) {
			/* Lookup vm (id) by name */
			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
				res = ENOENT;
				break;
			} else if ((vm->vm_state & VM_STATE_SHUTDOWN) &&
			    (flags & VMOP_FORCE) == 0) {
				/* already shutting down; needs force flag */
				res = EALREADY;
				break;
			} else if (!(vm->vm_state & VM_STATE_RUNNING)) {
				res = EINVAL;
				break;
			}
			id = vm->vm_vmid;
		} else if ((vm = vm_getbyvmid(id)) == NULL) {
			res = ENOENT;
			break;
		}
		/* only the configured owner (or root) may act on the vm */
		if (vm_checkperm(vm, &vm->vm_params.vmc_owner, vid.vid_uid)) {
			res = EPERM;
			break;
		}

		/* Only relay TERMINATION requests, not WAIT requests */
		if (imsg->hdr.type == IMSG_VMDOP_TERMINATE_VM_REQUEST) {
			memset(&vid, 0, sizeof(vid));
			vid.vid_id = id;
			vid.vid_flags = flags;

			if (proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
				imsg->hdr.peerid, -1, &vid, sizeof(vid)) == -1)
				return (-1);
		}
		break;
	case IMSG_VMDOP_GET_INFO_VM_REQUEST:
		/* vmm owns the list of running VMs; relay the request */
		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
		break;
	case IMSG_VMDOP_LOAD:
		IMSG_SIZE_CHECK(imsg, str); /* at least one byte for path */
		str = get_string((uint8_t *)imsg->data,
		    IMSG_DATA_SIZE(imsg));
		/* FALLTHROUGH */
	case IMSG_VMDOP_RELOAD:
		/* str is NULL for RELOAD, selecting the default file */
		if (vmd_reload(0, str) == -1)
			cmd = IMSG_CTL_FAIL;
		else
			cmd = IMSG_CTL_OK;
		free(str);
		break;
	case IMSG_CTL_RESET:
		IMSG_SIZE_CHECK(imsg, &v);
		memcpy(&v, imsg->data, sizeof(v));
		if (vmd_reload(v, NULL) == -1)
			cmd = IMSG_CTL_FAIL;
		else
			cmd = IMSG_CTL_OK;
		break;
	case IMSG_CTL_VERBOSE:
		IMSG_SIZE_CHECK(imsg, &verbose);
		memcpy(&verbose, imsg->data, sizeof(verbose));
		log_setverbose(verbose);

		/* propagate the new log level to the other processes */
		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
		proc_forward_imsg(ps, imsg, PROC_PRIV, -1);
		cmd = IMSG_CTL_OK;
		break;
	case IMSG_VMDOP_PAUSE_VM:
	case IMSG_VMDOP_UNPAUSE_VM:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		if (vid.vid_id == 0) {
			/* no id given: look the vm up by name */
			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
				res = ENOENT;
				cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM
				    ? IMSG_VMDOP_PAUSE_VM_RESPONSE
				    : IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
				break;
			} else {
				vid.vid_id = vm->vm_vmid;
			}
		} else if ((vm = vm_getbyid(vid.vid_id)) == NULL) {
			res = ENOENT;
			cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM
			    ? IMSG_VMDOP_PAUSE_VM_RESPONSE
			    : IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
			break;
		}
		/* only the configured owner (or root) may pause/unpause */
		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
		    vid.vid_uid) != 0) {
			res = EPERM;
			cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM
			    ? IMSG_VMDOP_PAUSE_VM_RESPONSE
			    : IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
			break;
		}
		/* relay to vmm; the response comes back asynchronously */
		proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
		    imsg->hdr.peerid, -1, &vid, sizeof(vid));
		break;
	case IMSG_VMDOP_SEND_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		id = vid.vid_id;
		if (vid.vid_id == 0) {
			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
				res = ENOENT;
				cmd = IMSG_VMDOP_SEND_VM_RESPONSE;
				close(imsg->fd);	/* don't leak the fd */
				break;
			} else {
				vid.vid_id = vm->vm_vmid;
			}
		} else if ((vm = vm_getbyvmid(vid.vid_id)) == NULL) {
			res = ENOENT;
			cmd = IMSG_VMDOP_SEND_VM_RESPONSE;
			close(imsg->fd);
			break;
		}
		vmr.vmr_id = vid.vid_id;
		log_debug("%s: sending fd to vmm", __func__);
		/* hand the receive fd over to vmm along with the request */
		proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
		    imsg->hdr.peerid, imsg->fd, &vid, sizeof(vid));
		break;
	case IMSG_VMDOP_RECEIVE_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		if (imsg->fd == -1) {
			log_warnx("%s: invalid fd", __func__);
			return (-1);
		}
		/* read and validate the dump header from the migration fd */
		if (atomicio(read, imsg->fd, &vmh, sizeof(vmh)) !=
		    sizeof(vmh)) {
			log_warnx("%s: error reading vmh from received vm",
			    __func__);
			res = EIO;
			close(imsg->fd);
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			break;
		}

		if (vmd_check_vmh(&vmh)) {
			res = ENOENT;
			close(imsg->fd);
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			break;
		}
		/* next in the dump: the create parameters of the sent vm */
		if (atomicio(read, imsg->fd, &vmc, sizeof(vmc)) !=
		    sizeof(vmc)) {
			log_warnx("%s: error reading vmc from received vm",
			    __func__);
			res = EIO;
			close(imsg->fd);
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			break;
		}
		/* register under the local name; vmm assigns a fresh id */
		strlcpy(vmc.vmc_params.vcp_name, vid.vid_name,
		    sizeof(vmc.vmc_params.vcp_name));
		vmc.vmc_params.vcp_id = 0;

		ret = vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid);
		if (ret != 0) {
			res = errno;
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			close(imsg->fd);
		} else {
			vm->vm_state |= VM_STATE_RECEIVED;
			config_setvm(ps, vm, imsg->hdr.peerid,
			    vmc.vmc_owner.uid);
			log_debug("%s: sending fd to vmm", __func__);
			proc_compose_imsg(ps, PROC_VMM, -1,
			    IMSG_VMDOP_RECEIVE_VM_END, vm->vm_vmid, imsg->fd,
			    NULL, 0);
		}
		break;
	case IMSG_VMDOP_DONE:
		/* control finished starting up; (re)arm listening sockets */
		control_reset(&ps->ps_csock);
		TAILQ_FOREACH(rcs, &ps->ps_rcsocks, cs_entry)
			control_reset(rcs);
		cmd = 0;
		break;
	default:
		return (-1);
	}

	/* send the selected response, if any, back to control */
	switch (cmd) {
	case 0:
		break;
	case IMSG_VMDOP_START_VM_RESPONSE:
	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
		memset(&vmr, 0, sizeof(vmr));
		vmr.vmr_result = res;
		vmr.vmr_id = id;
		if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd,
		    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
			return (-1);
		break;
	default:
		if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd,
		    imsg->hdr.peerid, -1, &res, sizeof(res)) == -1)
			return (-1);
		break;
	}

	return (0);
}
339 
340 int
341 vmd_dispatch_vmm(int fd, struct privsep_proc *p, struct imsg *imsg)
342 {
343 	struct vmop_result	 vmr;
344 	struct privsep		*ps = p->p_ps;
345 	int			 res = 0;
346 	struct vmd_vm		*vm;
347 	struct vm_create_params	*vcp;
348 	struct vmop_info_result	 vir;
349 
350 	switch (imsg->hdr.type) {
351 	case IMSG_VMDOP_PAUSE_VM_RESPONSE:
352 		IMSG_SIZE_CHECK(imsg, &vmr);
353 		memcpy(&vmr, imsg->data, sizeof(vmr));
354 		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
355 			break;
356 		proc_compose_imsg(ps, PROC_CONTROL, -1,
357 		    imsg->hdr.type, imsg->hdr.peerid, -1,
358 		    imsg->data, sizeof(imsg->data));
359 		log_info("%s: paused vm %d successfully",
360 		    vm->vm_params.vmc_params.vcp_name,
361 		    vm->vm_vmid);
362 		vm->vm_state |= VM_STATE_PAUSED;
363 		break;
364 	case IMSG_VMDOP_UNPAUSE_VM_RESPONSE:
365 		IMSG_SIZE_CHECK(imsg, &vmr);
366 		memcpy(&vmr, imsg->data, sizeof(vmr));
367 		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
368 			break;
369 		proc_compose_imsg(ps, PROC_CONTROL, -1,
370 		    imsg->hdr.type, imsg->hdr.peerid, -1,
371 		    imsg->data, sizeof(imsg->data));
372 		log_info("%s: unpaused vm %d successfully.",
373 		    vm->vm_params.vmc_params.vcp_name,
374 		    vm->vm_vmid);
375 		vm->vm_state &= ~VM_STATE_PAUSED;
376 		break;
377 	case IMSG_VMDOP_START_VM_RESPONSE:
378 		IMSG_SIZE_CHECK(imsg, &vmr);
379 		memcpy(&vmr, imsg->data, sizeof(vmr));
380 		if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL)
381 			break;
382 		vm->vm_pid = vmr.vmr_pid;
383 		vcp = &vm->vm_params.vmc_params;
384 		vcp->vcp_id = vmr.vmr_id;
385 
386 		/*
387 		 * If the peerid is not -1, forward the response back to the
388 		 * the control socket.  If it is -1, the request originated
389 		 * from the parent, not the control socket.
390 		 */
391 		if (vm->vm_peerid != (uint32_t)-1) {
392 			(void)strlcpy(vmr.vmr_ttyname, vm->vm_ttyname,
393 			    sizeof(vmr.vmr_ttyname));
394 			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
395 			    imsg->hdr.type, vm->vm_peerid, -1,
396 			    &vmr, sizeof(vmr)) == -1) {
397 				errno = vmr.vmr_result;
398 				log_warn("%s: failed to forward vm result",
399 				    vcp->vcp_name);
400 				vm_terminate(vm, __func__);
401 				return (-1);
402 			}
403 		}
404 
405 		if (vmr.vmr_result) {
406 			log_warnx("%s: failed to start vm", vcp->vcp_name);
407 			vm_terminate(vm, __func__);
408 			errno = vmr.vmr_result;
409 			break;
410 		}
411 
412 		/* Now configure all the interfaces */
413 		if (vm_priv_ifconfig(ps, vm) == -1) {
414 			log_warn("%s: failed to configure vm", vcp->vcp_name);
415 			vm_terminate(vm, __func__);
416 			break;
417 		}
418 
419 		log_info("%s: started vm %d successfully, tty %s",
420 		    vcp->vcp_name, vm->vm_vmid, vm->vm_ttyname);
421 		break;
422 	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
423 		IMSG_SIZE_CHECK(imsg, &vmr);
424 		memcpy(&vmr, imsg->data, sizeof(vmr));
425 
426 		if (vmr.vmr_result) {
427 			DPRINTF("%s: forwarding TERMINATE VM for vm id %d",
428 			    __func__, vmr.vmr_id);
429 			proc_forward_imsg(ps, imsg, PROC_CONTROL, -1);
430 		} else {
431 			if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
432 				break;
433 			/* Mark VM as shutting down */
434 			vm->vm_state |= VM_STATE_SHUTDOWN;
435 		}
436 		break;
437 	case IMSG_VMDOP_SEND_VM_RESPONSE:
438 		IMSG_SIZE_CHECK(imsg, &vmr);
439 		memcpy(&vmr, imsg->data, sizeof(vmr));
440 		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
441 			break;
442 		if (!vmr.vmr_result) {
443 			log_info("%s: sent vm %d successfully.",
444 			    vm->vm_params.vmc_params.vcp_name,
445 			    vm->vm_vmid);
446 			vm_terminate(vm, __func__);
447 		}
448 
449 		/* Send a response if a control client is waiting for it */
450 		if (imsg->hdr.peerid != (uint32_t)-1) {
451 			/* the error is meaningless for deferred responses */
452 			vmr.vmr_result = 0;
453 
454 			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
455 			    IMSG_VMDOP_SEND_VM_RESPONSE,
456 			    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
457 				return (-1);
458 		}
459 		break;
460 	case IMSG_VMDOP_TERMINATE_VM_EVENT:
461 		IMSG_SIZE_CHECK(imsg, &vmr);
462 		memcpy(&vmr, imsg->data, sizeof(vmr));
463 		DPRINTF("%s: handling TERMINATE_EVENT for vm id %d ret %d",
464 		    __func__, vmr.vmr_id, vmr.vmr_result);
465 		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL) {
466 			log_debug("%s: vm %d is no longer available",
467 			    __func__, vmr.vmr_id);
468 			break;
469 		}
470 		if (vmr.vmr_result != EAGAIN ||
471 		    vm->vm_params.vmc_bootdevice) {
472 			vm_terminate(vm, __func__);
473 		} else {
474 			/* Stop VM instance but keep the tty open */
475 			vm_stop(vm, 1, __func__);
476 			config_setvm(ps, vm, (uint32_t)-1, vm->vm_uid);
477 		}
478 
479 		/* The error is meaningless for deferred responses */
480 		vmr.vmr_result = 0;
481 
482 		if (proc_compose_imsg(ps, PROC_CONTROL, -1,
483 			IMSG_VMDOP_TERMINATE_VM_EVENT,
484 			imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
485 			return (-1);
486 		break;
487 	case IMSG_VMDOP_GET_INFO_VM_DATA:
488 		IMSG_SIZE_CHECK(imsg, &vir);
489 		memcpy(&vir, imsg->data, sizeof(vir));
490 		if ((vm = vm_getbyvmid(vir.vir_info.vir_id)) != NULL) {
491 			memset(vir.vir_ttyname, 0, sizeof(vir.vir_ttyname));
492 			if (vm->vm_ttyname != NULL)
493 				strlcpy(vir.vir_ttyname, vm->vm_ttyname,
494 				    sizeof(vir.vir_ttyname));
495 			log_debug("%s: running vm: %d, vm_state: 0x%x",
496 			    __func__, vm->vm_vmid, vm->vm_state);
497 			vir.vir_state = vm->vm_state;
498 			/* get the user id who started the vm */
499 			vir.vir_uid = vm->vm_uid;
500 			vir.vir_gid = vm->vm_params.vmc_owner.gid;
501 		}
502 		if (proc_compose_imsg(ps,
503 		    imsg->hdr.peerid == IMSG_AGENTX_PEERID ?
504 		    PROC_AGENTX : PROC_CONTROL, -1, imsg->hdr.type,
505 		    imsg->hdr.peerid, -1, &vir, sizeof(vir)) == -1) {
506 			log_debug("%s: GET_INFO_VM failed for vm %d, removing",
507 			    __func__, vm->vm_vmid);
508 			vm_terminate(vm, __func__);
509 			return (-1);
510 		}
511 		break;
512 	case IMSG_VMDOP_GET_INFO_VM_END_DATA:
513 		/*
514 		 * PROC_VMM has responded with the *running* VMs, now we
515 		 * append the others. These use the special value 0 for their
516 		 * kernel id to indicate that they are not running.
517 		 */
518 		TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
519 			if (!(vm->vm_state & VM_STATE_RUNNING)) {
520 				memset(&vir, 0, sizeof(vir));
521 				vir.vir_info.vir_id = vm->vm_vmid;
522 				strlcpy(vir.vir_info.vir_name,
523 				    vm->vm_params.vmc_params.vcp_name,
524 				    VMM_MAX_NAME_LEN);
525 				vir.vir_info.vir_memory_size =
526 				    vm->vm_params.vmc_params.
527 				    vcp_memranges[0].vmr_size;
528 				vir.vir_info.vir_ncpus =
529 				    vm->vm_params.vmc_params.vcp_ncpus;
530 				/* get the configured user id for this vm */
531 				vir.vir_uid = vm->vm_params.vmc_owner.uid;
532 				vir.vir_gid = vm->vm_params.vmc_owner.gid;
533 				log_debug("%s: vm: %d, vm_state: 0x%x",
534 				    __func__, vm->vm_vmid, vm->vm_state);
535 				vir.vir_state = vm->vm_state;
536 				if (proc_compose_imsg(ps,
537 				    imsg->hdr.peerid == IMSG_AGENTX_PEERID ?
538 				    PROC_AGENTX : PROC_CONTROL, -1,
539 				    IMSG_VMDOP_GET_INFO_VM_DATA,
540 				    imsg->hdr.peerid, -1, &vir,
541 				    sizeof(vir)) == -1) {
542 					log_debug("%s: GET_INFO_VM_END failed",
543 					    __func__);
544 					vm_terminate(vm, __func__);
545 					return (-1);
546 				}
547 			}
548 		}
549 		IMSG_SIZE_CHECK(imsg, &res);
550 		proc_forward_imsg(ps, imsg,
551 		    imsg->hdr.peerid == IMSG_AGENTX_PEERID ?
552 		    PROC_AGENTX : PROC_CONTROL, -1);
553 		break;
554 	default:
555 		return (-1);
556 	}
557 
558 	return (0);
559 }
560 
561 int
562 vmd_dispatch_agentx(int fd, struct privsep_proc *p, struct imsg *imsg)
563 {
564 	struct privsep			*ps = p->p_ps;
565 
566 	switch (imsg->hdr.type) {
567 	case IMSG_VMDOP_GET_INFO_VM_REQUEST:
568 		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
569 		return (0);
570 	default:
571 		break;
572 	}
573 	return (-1);
574 }
575 
576 int
577 vmd_dispatch_priv(int fd, struct privsep_proc *p, struct imsg *imsg)
578 {
579 	struct vmop_addr_result	 var;
580 
581 	switch (imsg->hdr.type) {
582 	case IMSG_VMDOP_PRIV_GET_ADDR_RESPONSE:
583 		IMSG_SIZE_CHECK(imsg, &var);
584 		memcpy(&var, imsg->data, sizeof(var));
585 		proc_forward_imsg(p->p_ps, imsg, PROC_VMM, -1);
586 		break;
587 	default:
588 		return (-1);
589 	}
590 
591 	return (0);
592 }
593 
594 int
595 vmd_check_vmh(struct vm_dump_header *vmh)
596 {
597 	int i;
598 	unsigned int code, leaf;
599 	unsigned int a, b, c, d;
600 
601 	if (strncmp(vmh->vmh_signature, VM_DUMP_SIGNATURE, strlen(VM_DUMP_SIGNATURE)) != 0) {
602 		log_warnx("%s: incompatible dump signature", __func__);
603 		return (-1);
604 	}
605 
606 	if (vmh->vmh_version != VM_DUMP_VERSION) {
607 		log_warnx("%s: incompatible dump version", __func__);
608 		return (-1);
609 	}
610 
611 	for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
612 		code = vmh->vmh_cpuids[i].code;
613 		leaf = vmh->vmh_cpuids[i].leaf;
614 		if (leaf != 0x00) {
615 			log_debug("%s: invalid leaf 0x%x for code 0x%x",
616 			    __func__, leaf, code);
617 			return (-1);
618 		}
619 
620 		switch (code) {
621 		case 0x00:
622 			CPUID_LEAF(code, leaf, a, b, c, d);
623 			if (vmh->vmh_cpuids[i].a > a) {
624 				log_debug("%s: incompatible cpuid level",
625 				    __func__);
626 				return (-1);
627 			}
628 			if (!(vmh->vmh_cpuids[i].b == b &&
629 			    vmh->vmh_cpuids[i].c == c &&
630 			    vmh->vmh_cpuids[i].d == d)) {
631 				log_debug("%s: incompatible cpu brand",
632 				    __func__);
633 				return (-1);
634 			}
635 			break;
636 
637 		case 0x01:
638 			CPUID_LEAF(code, leaf, a, b, c, d);
639 			if ((vmh->vmh_cpuids[i].c & c & VMM_CPUIDECX_MASK) !=
640 			    (vmh->vmh_cpuids[i].c & VMM_CPUIDECX_MASK)) {
641 				log_debug("%s: incompatible cpu features "
642 				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
643 				    code, leaf);
644 				return (-1);
645 			}
646 			if ((vmh->vmh_cpuids[i].d & d & VMM_CPUIDEDX_MASK) !=
647 			    (vmh->vmh_cpuids[i].d & VMM_CPUIDEDX_MASK)) {
648 				log_debug("%s: incompatible cpu features "
649 				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
650 				    code, leaf);
651 				return (-1);
652 			}
653 			break;
654 
655 		case 0x07:
656 			CPUID_LEAF(code, leaf, a, b, c, d);
657 			if ((vmh->vmh_cpuids[i].b & b & VMM_SEFF0EBX_MASK) !=
658 			    (vmh->vmh_cpuids[i].b & VMM_SEFF0EBX_MASK)) {
659 				log_debug("%s: incompatible cpu features "
660 				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
661 				    code, leaf);
662 				return (-1);
663 			}
664 			if ((vmh->vmh_cpuids[i].c & c & VMM_SEFF0ECX_MASK) !=
665 			    (vmh->vmh_cpuids[i].c & VMM_SEFF0ECX_MASK)) {
666 				log_debug("%s: incompatible cpu features "
667 				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
668 				    code, leaf);
669 				return (-1);
670 			}
671 			break;
672 
673 		case 0x0d:
674 			CPUID_LEAF(code, leaf, a, b, c, d);
675 			if (vmh->vmh_cpuids[i].b > b) {
676 				log_debug("%s: incompatible cpu: insufficient "
677 				    "max save area for enabled XCR0 features",
678 				    __func__);
679 				return (-1);
680 			}
681 			if (vmh->vmh_cpuids[i].c > c) {
682 				log_debug("%s: incompatible cpu: insufficient "
683 				    "max save area for supported XCR0 features",
684 				    __func__);
685 				return (-1);
686 			}
687 			break;
688 
689 		case 0x80000001:
690 			CPUID_LEAF(code, leaf, a, b, c, d);
691 			if ((vmh->vmh_cpuids[i].a & a) !=
692 			    vmh->vmh_cpuids[i].a) {
693 				log_debug("%s: incompatible cpu features "
694 				    "code: 0x%x leaf: 0x%x  reg: a", __func__,
695 				    code, leaf);
696 				return (-1);
697 			}
698 			if ((vmh->vmh_cpuids[i].c & c) !=
699 			    vmh->vmh_cpuids[i].c) {
700 				log_debug("%s: incompatible cpu features "
701 				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
702 				    code, leaf);
703 				return (-1);
704 			}
705 			if ((vmh->vmh_cpuids[i].d & d) !=
706 			    vmh->vmh_cpuids[i].d) {
707 				log_debug("%s: incompatible cpu features "
708 				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
709 				    code, leaf);
710 				return (-1);
711 			}
712 			break;
713 
714 		default:
715 			log_debug("%s: unknown code 0x%x", __func__, code);
716 			return (-1);
717 		}
718 	}
719 
720 	return (0);
721 }
722 
723 void
724 vmd_sighdlr(int sig, short event, void *arg)
725 {
726 	if (privsep_process != PROC_PARENT)
727 		return;
728 	log_debug("%s: handling signal", __func__);
729 
730 	switch (sig) {
731 	case SIGHUP:
732 		log_info("%s: reload requested with SIGHUP", __func__);
733 
734 		/*
735 		 * This is safe because libevent uses async signal handlers
736 		 * that run in the event loop and not in signal context.
737 		 */
738 		(void)vmd_reload(0, NULL);
739 		break;
740 	case SIGPIPE:
741 		log_info("%s: ignoring SIGPIPE", __func__);
742 		break;
743 	case SIGUSR1:
744 		log_info("%s: ignoring SIGUSR1", __func__);
745 		break;
746 	case SIGTERM:
747 	case SIGINT:
748 		vmd_shutdown();
749 		break;
750 	default:
751 		fatalx("unexpected signal");
752 	}
753 }
754 
755 __dead void
756 usage(void)
757 {
758 	extern char *__progname;
759 	fprintf(stderr, "usage: %s [-dnv] [-D macro=value] [-f file]\n",
760 	    __progname);
761 	exit(1);
762 }
763 
/*
 * vmd entry point: parse options, set up the privsep framework (the
 * same binary re-execs itself with -P/-I for each child process), and
 * run the parent's event loop.  Only the parent process returns from
 * proc_init() and reaches the code after it.
 */
int
main(int argc, char **argv)
{
	struct privsep		*ps;
	int			 ch;
	const char		*conffile = VMD_CONF;
	enum privsep_procid	 proc_id = PROC_PARENT;
	int			 proc_instance = 0;
	const char		*errp, *title = NULL;
	int			 argc0 = argc;	/* full argv for re-exec */

	log_init(0, LOG_DAEMON);

	if ((env = calloc(1, sizeof(*env))) == NULL)
		fatal("calloc: env");

	while ((ch = getopt(argc, argv, "D:P:I:df:vn")) != -1) {
		switch (ch) {
		case 'D':	/* define a config macro */
			if (cmdline_symset(optarg) < 0)
				log_warnx("could not parse macro definition %s",
				    optarg);
			break;
		case 'd':	/* debug: stay in foreground */
			env->vmd_debug = 2;
			break;
		case 'f':	/* alternative config file */
			conffile = optarg;
			break;
		case 'v':	/* increase verbosity */
			env->vmd_verbose++;
			break;
		case 'n':	/* config test only */
			env->vmd_noaction = 1;
			break;
		case 'P':	/* internal: run as the named child proc */
			title = optarg;
			proc_id = proc_getid(procs, nitems(procs), title);
			if (proc_id == PROC_MAX)
				fatalx("invalid process name");
			break;
		case 'I':	/* internal: child process instance number */
			proc_instance = strtonum(optarg, 0,
			    PROC_MAX_INSTANCES, &errp);
			if (errp)
				fatalx("invalid process instance");
			break;
		default:
			usage();
		}
	}

	argc -= optind;
	if (argc > 0)
		usage();

	/* -n implies foreground so the result is visible */
	if (env->vmd_noaction && !env->vmd_debug)
		env->vmd_debug = 1;

	log_init(env->vmd_debug, LOG_DAEMON);
	log_setverbose(env->vmd_verbose);

	/* check for root privileges */
	if (env->vmd_noaction == 0) {
		if (geteuid())
			fatalx("need root privileges");
	}

	ps = &env->vmd_ps;
	ps->ps_env = env;
	env->vmd_fd = -1;

	if (config_init(env) == -1)
		fatal("failed to initialize configuration");

	if ((ps->ps_pw = getpwnam(VMD_USER)) == NULL)
		fatal("unknown user %s", VMD_USER);

	/* First proc runs as root without pledge but in default chroot */
	proc_priv->p_pw = &proc_privpw; /* initialized to all 0 */
	proc_priv->p_chroot = ps->ps_pw->pw_dir; /* from VMD_USER */

	/* Open /dev/vmm early. */
	if (env->vmd_noaction == 0 && proc_id == PROC_PARENT) {
		env->vmd_fd = open(VMM_NODE, O_RDWR);
		if (env->vmd_fd == -1)
			fatal("%s", VMM_NODE);
	}

	/* Configure the control socket */
	ps->ps_csock.cs_name = SOCKET_NAME;
	TAILQ_INIT(&ps->ps_rcsocks);

	/* Configuration will be parsed after forking the children */
	env->vmd_conffile = conffile;

	if (env->vmd_noaction)
		ps->ps_noaction = 1;
	ps->ps_instance = proc_instance;
	if (title != NULL)
		ps->ps_title[proc_id] = title;

	/* only the parent returns */
	proc_init(ps, procs, nitems(procs), env->vmd_debug, argc0, argv,
	    proc_id);

	log_procinit("parent");
	if (!env->vmd_debug && daemon(0, 0) == -1)
		fatal("can't daemonize");

	if (ps->ps_noaction == 0)
		log_info("startup");

	event_init();

	/* hook the parent's signal handling into the event loop */
	signal_set(&ps->ps_evsigint, SIGINT, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigterm, SIGTERM, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsighup, SIGHUP, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigpipe, SIGPIPE, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigusr1, SIGUSR1, vmd_sighdlr, ps);

	signal_add(&ps->ps_evsigint, NULL);
	signal_add(&ps->ps_evsigterm, NULL);
	signal_add(&ps->ps_evsighup, NULL);
	signal_add(&ps->ps_evsigpipe, NULL);
	signal_add(&ps->ps_evsigusr1, NULL);

	/* connect the imsg channels between all processes */
	if (!env->vmd_noaction)
		proc_connect(ps);

	if (vmd_configure() == -1)
		fatalx("configuration failed");

	event_dispatch();

	log_debug("parent exiting");

	return (0);
}
903 
904 void
905 start_vm_batch(int fd, short type, void *args)
906 {
907 	int		i = 0;
908 	struct vmd_vm	*vm;
909 
910 	log_debug("%s: starting batch of %d vms", __func__,
911 	    env->vmd_cfg.parallelism);
912 	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
913 		if (!(vm->vm_state & VM_STATE_WAITING)) {
914 			log_debug("%s: not starting vm %s (disabled)",
915 			    __func__,
916 			    vm->vm_params.vmc_params.vcp_name);
917 			continue;
918 		}
919 		i++;
920 		if (i > env->vmd_cfg.parallelism) {
921 			evtimer_add(&staggered_start_timer,
922 			    &env->vmd_cfg.delay);
923 			break;
924 		}
925 		vm->vm_state &= ~VM_STATE_WAITING;
926 		config_setvm(&env->vmd_ps, vm, -1, vm->vm_params.vmc_owner.uid);
927 	}
928 	log_debug("%s: done starting vms", __func__);
929 }
930 
/*
 * Finish parent start-up after the children have been forked: open the
 * pty multiplexer, drop privileges with pledge(2), parse the config,
 * hand the vmm device fd to the vmm process, create the configured
 * switches and kick off the staggered VM autostart.
 *
 * Returns 0 on success, -1 on failure (exits directly on config-test
 * or parse errors).
 */
int
vmd_configure(void)
{
	int			ncpus;
	struct vmd_switch	*vsw;
	int ncpu_mib[] = {CTL_HW, HW_NCPUONLINE};
	size_t ncpus_sz = sizeof(ncpus);

	/* pty master device, used to allocate VM ttys later on */
	if ((env->vmd_ptmfd = open(PATH_PTMDEV, O_RDWR|O_CLOEXEC)) == -1)
		fatal("open %s", PATH_PTMDEV);

	/*
	 * pledge in the parent process:
	 * stdio - for malloc and basic I/O including events.
	 * rpath - for reload to open and read the configuration files.
	 * wpath - for opening disk images and tap devices.
	 * tty - for openpty and TIOCUCNTL.
	 * proc - run kill to terminate its children safely.
	 * sendfd - for disks, interfaces and other fds.
	 * recvfd - for send and receive.
	 * getpw - lookup user or group id by name.
	 * chown, fattr - change tty ownership
	 * flock - locking disk files
	 */
	if (pledge("stdio rpath wpath proc tty recvfd sendfd getpw"
	    " chown fattr flock", NULL) == -1)
		fatal("pledge");

	if (parse_config(env->vmd_conffile) == -1) {
		proc_kill(&env->vmd_ps);
		exit(1);
	}

	/* -n: config test only, tear the children down again */
	if (env->vmd_noaction) {
		fprintf(stderr, "configuration OK\n");
		proc_kill(&env->vmd_ps);
		exit(0);
	}

	/* Send VMM device fd to vmm proc. */
	proc_compose_imsg(&env->vmd_ps, PROC_VMM, -1,
	    IMSG_VMDOP_RECEIVE_VMM_FD, -1, env->vmd_fd, NULL, 0);

	/* Send shared global configuration to all children */
	if (config_setconfig(env) == -1)
		return (-1);

	TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
		if (vsw->sw_running)
			continue;
		if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) {
			log_warn("%s: failed to create switch %s",
			    __func__, vsw->sw_name);
			switch_remove(vsw);
			return (-1);
		}
	}

	/*
	 * Default staggered start: one batch per online cpu with a
	 * fixed delay between batches, unless the config overrode it.
	 */
	if (!(env->vmd_cfg.cfg_flags & VMD_CFG_STAGGERED_START)) {
		env->vmd_cfg.delay.tv_sec = VMD_DEFAULT_STAGGERED_START_DELAY;
		if (sysctl(ncpu_mib, nitems(ncpu_mib), &ncpus, &ncpus_sz, NULL, 0) == -1)
			ncpus = 1;	/* fall back to a single batch */
		env->vmd_cfg.parallelism = ncpus;
		log_debug("%s: setting staggered start configuration to "
		    "parallelism: %d and delay: %lld",
		    __func__, ncpus, (long long) env->vmd_cfg.delay.tv_sec);
	}

	log_debug("%s: starting vms in staggered fashion", __func__);
	evtimer_set(&staggered_start_timer, start_vm_batch, NULL);
	/* start first batch */
	start_vm_batch(0, 0, NULL);

	return (0);
}
1006 
/*
 * Reload (or purge) the running configuration.
 *
 * If reset is non-zero, purge state at the given reset level instead
 * of parsing a config file.  filename names the config file to parse;
 * NULL or "" selects the default file and also enables "reload"
 * semantics, where all non-running VMs are removed before re-parsing
 * (a plain load only adds to the existing VM list).
 *
 * Returns 0 on success, -1 on parse or configuration errors.
 */
int
vmd_reload(unsigned int reset, const char *filename)
{
	struct vmd_vm		*vm, *next_vm;
	struct vmd_switch	*vsw;
	int			 reload = 0;

	/* Switch back to the default config file */
	if (filename == NULL || *filename == '\0') {
		filename = env->vmd_conffile;
		reload = 1;
	}

	log_debug("%s: level %d config file %s", __func__, reset, filename);

	if (reset) {
		/* Purge the configuration */
		config_purge(env, reset);
		config_setreset(env, reset);
	} else {
		/*
		 * Load or reload the configuration.
		 *
		 * Reloading removes all non-running VMs before processing the
		 * config file, whereas loading only adds to the existing list
		 * of VMs.
		 */

		if (reload) {
			TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry,
			    next_vm) {
				if (!(vm->vm_state & VM_STATE_RUNNING)) {
					DPRINTF("%s: calling vm_remove",
					    __func__);
					vm_remove(vm, __func__);
				}
			}
		}

		if (parse_config(filename) == -1) {
			log_debug("%s: failed to load config file %s",
			    __func__, filename);
			return (-1);
		}

		if (reload) {
			/* Update shared global configuration in all children */
			if (config_setconfig(env) == -1)
				return (-1);
		}

		/* bring up any newly configured switches */
		TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
			if (vsw->sw_running)
				continue;
			if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) {
				log_warn("%s: failed to create switch %s",
				    __func__, vsw->sw_name);
				switch_remove(vsw);
				return (-1);
			}
		}

		log_debug("%s: starting vms in staggered fashion", __func__);
		evtimer_set(&staggered_start_timer, start_vm_batch, NULL);
		/* start first batch */
		start_vm_batch(0, 0, NULL);
	}

	return (0);
}
1078 
1079 void
1080 vmd_shutdown(void)
1081 {
1082 	struct vmd_vm *vm, *vm_next;
1083 
1084 	log_debug("%s: performing shutdown", __func__);
1085 
1086 	TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry, vm_next) {
1087 		vm_remove(vm, __func__);
1088 	}
1089 
1090 	proc_kill(&env->vmd_ps);
1091 	free(env);
1092 
1093 	log_warnx("parent terminating");
1094 	exit(0);
1095 }
1096 
1097 struct vmd_vm *
1098 vm_getbyvmid(uint32_t vmid)
1099 {
1100 	struct vmd_vm	*vm;
1101 
1102 	if (vmid == 0)
1103 		return (NULL);
1104 	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1105 		if (vm->vm_vmid == vmid)
1106 			return (vm);
1107 	}
1108 
1109 	return (NULL);
1110 }
1111 
1112 struct vmd_vm *
1113 vm_getbyid(uint32_t id)
1114 {
1115 	struct vmd_vm	*vm;
1116 
1117 	if (id == 0)
1118 		return (NULL);
1119 	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1120 		if (vm->vm_params.vmc_params.vcp_id == id)
1121 			return (vm);
1122 	}
1123 
1124 	return (NULL);
1125 }
1126 
1127 uint32_t
1128 vm_id2vmid(uint32_t id, struct vmd_vm *vm)
1129 {
1130 	if (vm == NULL && (vm = vm_getbyid(id)) == NULL)
1131 		return (0);
1132 	DPRINTF("%s: vmm id %u is vmid %u", __func__,
1133 	    id, vm->vm_vmid);
1134 	return (vm->vm_vmid);
1135 }
1136 
1137 uint32_t
1138 vm_vmid2id(uint32_t vmid, struct vmd_vm *vm)
1139 {
1140 	if (vm == NULL && (vm = vm_getbyvmid(vmid)) == NULL)
1141 		return (0);
1142 	DPRINTF("%s: vmid %u is vmm id %u", __func__,
1143 	    vmid, vm->vm_params.vmc_params.vcp_id);
1144 	return (vm->vm_params.vmc_params.vcp_id);
1145 }
1146 
1147 struct vmd_vm *
1148 vm_getbyname(const char *name)
1149 {
1150 	struct vmd_vm	*vm;
1151 
1152 	if (name == NULL)
1153 		return (NULL);
1154 	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1155 		if (strcmp(vm->vm_params.vmc_params.vcp_name, name) == 0)
1156 			return (vm);
1157 	}
1158 
1159 	return (NULL);
1160 }
1161 
1162 struct vmd_vm *
1163 vm_getbypid(pid_t pid)
1164 {
1165 	struct vmd_vm	*vm;
1166 
1167 	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1168 		if (vm->vm_pid == pid)
1169 			return (vm);
1170 	}
1171 
1172 	return (NULL);
1173 }
1174 
/*
 * vm_stop
 *
 * Tears down a VM's runtime state without removing it from the list of
 * known VMs: clears the run-state flags, stops the event channel to the
 * VM process and closes all of its disk, network interface, kernel and
 * cdrom file descriptors.
 *
 * Parameters:
 *  vm: the VM to stop; NULL is silently ignored
 *  keeptty: non-zero to keep the console tty open (and vm_uid set)
 *  caller: name of the calling function, for debug logging
 */
void
vm_stop(struct vmd_vm *vm, int keeptty, const char *caller)
{
	struct privsep	*ps = &env->vmd_ps;
	unsigned int	 i, j;

	if (vm == NULL)
		return;

	log_debug("%s: %s %s stopping vm %d%s",
	    __func__, ps->ps_title[privsep_process], caller,
	    vm->vm_vmid, keeptty ? ", keeping tty open" : "");

	vm->vm_state &= ~(VM_STATE_RECEIVED | VM_STATE_RUNNING
	    | VM_STATE_SHUTDOWN);

	/*
	 * Stop the event channel to the VM process.
	 * NOTE(review): the fd is not reset to -1 after close() --
	 * confirm vm_stop() cannot run twice on the same VM before
	 * the fd is reinitialized.
	 */
	if (vm->vm_iev.ibuf.fd != -1) {
		event_del(&vm->vm_iev.ev);
		close(vm->vm_iev.ibuf.fd);
	}
	/* Close every disk fd, including per-disk base image layers. */
	for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++) {
		for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
			if (vm->vm_disks[i][j] != -1) {
				close(vm->vm_disks[i][j]);
				vm->vm_disks[i][j] = -1;
			}
		}
	}
	/* Close interface fds and release their name/switch/group strings. */
	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++) {
		if (vm->vm_ifs[i].vif_fd != -1) {
			close(vm->vm_ifs[i].vif_fd);
			vm->vm_ifs[i].vif_fd = -1;
		}
		free(vm->vm_ifs[i].vif_name);
		free(vm->vm_ifs[i].vif_switch);
		free(vm->vm_ifs[i].vif_group);
		vm->vm_ifs[i].vif_name = NULL;
		vm->vm_ifs[i].vif_switch = NULL;
		vm->vm_ifs[i].vif_group = NULL;
	}
	if (vm->vm_kernel != -1) {
		close(vm->vm_kernel);
		vm->vm_kernel = -1;
	}
	if (vm->vm_cdrom != -1) {
		close(vm->vm_cdrom);
		vm->vm_cdrom = -1;
	}
	/* Unless asked to keep it (e.g. for a restart), drop the console. */
	if (!keeptty) {
		vm_closetty(vm);
		vm->vm_uid = 0;
	}
}
1228 
1229 void
1230 vm_remove(struct vmd_vm *vm, const char *caller)
1231 {
1232 	struct privsep	*ps = &env->vmd_ps;
1233 
1234 	if (vm == NULL)
1235 		return;
1236 
1237 	log_debug("%s: %s %s removing vm %d from running config",
1238 	    __func__, ps->ps_title[privsep_process], caller,
1239 	    vm->vm_vmid);
1240 
1241 	TAILQ_REMOVE(env->vmd_vms, vm, vm_entry);
1242 
1243 	vm_stop(vm, 0, caller);
1244 	free(vm);
1245 }
1246 
1247 int
1248 vm_claimid(const char *name, int uid, uint32_t *id)
1249 {
1250 	struct name2id *n2i = NULL;
1251 
1252 	TAILQ_FOREACH(n2i, env->vmd_known, entry)
1253 		if (strcmp(n2i->name, name) == 0 && n2i->uid == uid)
1254 			goto out;
1255 
1256 	if (++env->vmd_nvm == 0) {
1257 		log_warnx("too many vms");
1258 		return (-1);
1259 	}
1260 	if ((n2i = calloc(1, sizeof(struct name2id))) == NULL) {
1261 		log_warnx("could not alloc vm name");
1262 		return (-1);
1263 	}
1264 	n2i->id = env->vmd_nvm;
1265 	n2i->uid = uid;
1266 	if (strlcpy(n2i->name, name, sizeof(n2i->name)) >= sizeof(n2i->name)) {
1267 		log_warnx("vm name too long");
1268 		free(n2i);
1269 		return (-1);
1270 	}
1271 	TAILQ_INSERT_TAIL(env->vmd_known, n2i, entry);
1272 
1273 out:
1274 	*id = n2i->id;
1275 	return (0);
1276 }
1277 
/*
 * vm_register
 *
 * Validates the create parameters and the caller's permissions, then
 * registers a new VM (or an instance of an existing VM) in the global
 * list of known VMs.  On success the newly allocated VM is appended to
 * env->vmd_vms and returned via 'ret_vm'.
 *
 * Parameters:
 *  ps: the privsep context
 *  vmc: the VM create parameters; copied into the new VM on success
 *  ret_vm: returns the registered VM (also set, with errno EALREADY,
 *      when a VM of the same name/id already exists)
 *  id: the internal vm id to use, or 0 to claim a new one
 *  uid: the user ID of the user making the request
 *
 * Return values:
 *   0: success, *ret_vm points at the registered VM
 *  -1: failure, errno indicates the reason (EPERM, EALREADY, EINVAL, ...)
 */
int
vm_register(struct privsep *ps, struct vmop_create_params *vmc,
    struct vmd_vm **ret_vm, uint32_t id, uid_t uid)
{
	struct vmd_vm		*vm = NULL, *vm_parent = NULL;
	struct vm_create_params	*vcp = &vmc->vmc_params;
	struct vmop_owner	*vmo = NULL;
	uint32_t		 nid, rng;
	unsigned int		 i, j;
	struct vmd_switch	*sw;
	char			*s;
	int			 ret = 0;

	/* Check if this is an instance of another VM */
	if ((ret = vm_instance(ps, &vm_parent, vmc, uid)) != 0) {
		errno = ret; /* XXX might set invalid errno */
		return (-1);
	}

	errno = 0;
	*ret_vm = NULL;

	/* A VM with the same name or id may already be registered. */
	if ((vm = vm_getbyname(vcp->vcp_name)) != NULL ||
	    (vm = vm_getbyvmid(vcp->vcp_id)) != NULL) {
		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
		    uid) != 0) {
			errno = EPERM;
			goto fail;
		}
		/* Hand the existing VM back to the caller. */
		*ret_vm = vm;
		errno = EALREADY;
		goto fail;
	}

	if (vm_parent != NULL)
		vmo = &vm_parent->vm_params.vmc_insowner;

	/* non-root users can only start existing VMs or instances */
	if (vm_checkperm(NULL, vmo, uid) != 0) {
		log_warnx("permission denied");
		errno = EPERM;
		goto fail;
	}
	if (vmc->vmc_flags == 0) {
		log_warnx("invalid configuration, no devices");
		errno = VMD_DISK_MISSING;
		goto fail;
	}
	/* Apply defaults, then validate the remaining parameters. */
	if (vcp->vcp_ncpus == 0)
		vcp->vcp_ncpus = 1;
	if (vcp->vcp_memranges[0].vmr_size == 0)
		vcp->vcp_memranges[0].vmr_size = VM_DEFAULT_MEMORY;
	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM) {
		log_warnx("invalid number of CPUs");
		goto fail;
	} else if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM) {
		log_warnx("invalid number of disks");
		goto fail;
	} else if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM) {
		log_warnx("invalid number of interfaces");
		goto fail;
	} else if (strlen(vcp->vcp_kernel) == 0 &&
	    vcp->vcp_ndisks == 0 && strlen(vcp->vcp_cdrom) == 0) {
		log_warnx("no kernel or disk/cdrom specified");
		goto fail;
	} else if (strlen(vcp->vcp_name) == 0) {
		log_warnx("invalid VM name");
		goto fail;
	} else if (*vcp->vcp_name == '-' || *vcp->vcp_name == '.' ||
	    *vcp->vcp_name == '_') {
		log_warnx("invalid VM name");
		goto fail;
	} else {
		/* Names are limited to alphanumerics plus '.', '-', '_'. */
		for (s = vcp->vcp_name; *s != '\0'; ++s) {
			if (!(isalnum((unsigned char)*s) || *s == '.' || \
			    *s == '-' || *s == '_')) {
				log_warnx("invalid VM name");
				goto fail;
			}
		}
	}

	if ((vm = calloc(1, sizeof(*vm))) == NULL)
		goto fail;

	/* From here on, operate on the VM's own copy of the parameters. */
	memcpy(&vm->vm_params, vmc, sizeof(vm->vm_params));
	vmc = &vm->vm_params;
	vcp = &vmc->vmc_params;
	vm->vm_pid = -1;
	vm->vm_tty = -1;
	vm->vm_receive_fd = -1;
	vm->vm_state &= ~VM_STATE_PAUSED;

	/* Mark all descriptors as unused. */
	for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++)
		for (j = 0; j < VM_MAX_BASE_PER_DISK; j++)
			vm->vm_disks[i][j] = -1;
	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
		vm->vm_ifs[i].vif_fd = -1;
	for (i = 0; i < vcp->vcp_nnics; i++) {
		if ((sw = switch_getbyname(vmc->vmc_ifswitch[i])) != NULL) {
			/* inherit per-interface flags from the switch */
			vmc->vmc_ifflags[i] |= (sw->sw_flags & VMIFF_OPTMASK);
		}

		/*
		 * If the MAC address is zero, always randomize it in vmd(8)
		 * because we cannot rely on the guest OS to do the right
		 * thing like OpenBSD does.  Based on ether_fakeaddr()
		 * from the kernel, incremented by one to differentiate
		 * the source.
		 */
		if (memcmp(zero_mac, &vcp->vcp_macs[i], ETHER_ADDR_LEN) == 0) {
			rng = arc4random();
			vcp->vcp_macs[i][0] = 0xfe;
			vcp->vcp_macs[i][1] = 0xe1;
			vcp->vcp_macs[i][2] = 0xba + 1;
			vcp->vcp_macs[i][3] = 0xd0 | ((i + 1) & 0xf);
			vcp->vcp_macs[i][4] = rng;
			vcp->vcp_macs[i][5] = rng >> 8;
		}
	}
	vm->vm_kernel = -1;
	vm->vm_cdrom = -1;
	vm->vm_iev.ibuf.fd = -1;

	/*
	 * Assign a new internal Id if not specified and we succeed in
	 * claiming a new Id.
	 */
	if (id != 0)
		vm->vm_vmid = id;
	else if (vm_claimid(vcp->vcp_name, uid, &nid) == -1)
		goto fail;
	else
		vm->vm_vmid = nid;

	log_debug("%s: registering vm %d", __func__, vm->vm_vmid);
	TAILQ_INSERT_TAIL(env->vmd_vms, vm, vm_entry);

	*ret_vm = vm;
	return (0);
 fail:
	if (errno == 0)
		errno = EINVAL;
	return (-1);
}
1424 
1425 int
1426 vm_instance(struct privsep *ps, struct vmd_vm **vm_parent,
1427     struct vmop_create_params *vmc, uid_t uid)
1428 {
1429 	char			*name;
1430 	struct vm_create_params	*vcp = &vmc->vmc_params;
1431 	struct vmop_create_params *vmcp;
1432 	struct vm_create_params	*vcpp;
1433 	struct vmd_vm		*vm = NULL;
1434 	unsigned int		 i, j;
1435 
1436 	/* return without error if the parent is NULL (nothing to inherit) */
1437 	if ((vmc->vmc_flags & VMOP_CREATE_INSTANCE) == 0 ||
1438 	    vmc->vmc_instance[0] == '\0')
1439 		return (0);
1440 
1441 	if ((*vm_parent = vm_getbyname(vmc->vmc_instance)) == NULL) {
1442 		return (VMD_PARENT_INVALID);
1443 	}
1444 
1445 	vmcp = &(*vm_parent)->vm_params;
1446 	vcpp = &vmcp->vmc_params;
1447 
1448 	/* Are we allowed to create an instance from this VM? */
1449 	if (vm_checkperm(NULL, &vmcp->vmc_insowner, uid) != 0) {
1450 		log_warnx("vm \"%s\" no permission to create vm instance",
1451 		    vcpp->vcp_name);
1452 		return (ENAMETOOLONG);
1453 	}
1454 
1455 	name = vcp->vcp_name;
1456 
1457 	if ((vm = vm_getbyname(vcp->vcp_name)) != NULL ||
1458 	    (vm = vm_getbyvmid(vcp->vcp_id)) != NULL) {
1459 		return (EPROCLIM);
1460 	}
1461 
1462 	/* CPU */
1463 	if (vcp->vcp_ncpus == 0)
1464 		vcp->vcp_ncpus = vcpp->vcp_ncpus;
1465 	if (vm_checkinsflag(vmcp, VMOP_CREATE_CPU, uid) != 0 &&
1466 	    vcp->vcp_ncpus != vcpp->vcp_ncpus) {
1467 		log_warnx("vm \"%s\" no permission to set cpus", name);
1468 		return (EPERM);
1469 	}
1470 
1471 	/* memory */
1472 	if (vcp->vcp_memranges[0].vmr_size == 0)
1473 		vcp->vcp_memranges[0].vmr_size =
1474 		    vcpp->vcp_memranges[0].vmr_size;
1475 	if (vm_checkinsflag(vmcp, VMOP_CREATE_MEMORY, uid) != 0 &&
1476 	    vcp->vcp_memranges[0].vmr_size !=
1477 	    vcpp->vcp_memranges[0].vmr_size) {
1478 		log_warnx("vm \"%s\" no permission to set memory", name);
1479 		return (EPERM);
1480 	}
1481 
1482 	/* disks cannot be inherited */
1483 	if (vm_checkinsflag(vmcp, VMOP_CREATE_DISK, uid) != 0 &&
1484 	    vcp->vcp_ndisks) {
1485 		log_warnx("vm \"%s\" no permission to set disks", name);
1486 		return (EPERM);
1487 	}
1488 	for (i = 0; i < vcp->vcp_ndisks; i++) {
1489 		/* Check if this disk is already used in the parent */
1490 		for (j = 0; j < vcpp->vcp_ndisks; j++) {
1491 			if (strcmp(vcp->vcp_disks[i],
1492 			    vcpp->vcp_disks[j]) == 0) {
1493 				log_warnx("vm \"%s\" disk %s cannot be reused",
1494 				    name, vcp->vcp_disks[i]);
1495 				return (EBUSY);
1496 			}
1497 		}
1498 		vmc->vmc_checkaccess |= VMOP_CREATE_DISK;
1499 	}
1500 
1501 	/* interfaces */
1502 	if (vcp->vcp_nnics > 0 &&
1503 	    vm_checkinsflag(vmcp, VMOP_CREATE_NETWORK, uid) != 0 &&
1504 	    vcp->vcp_nnics != vcpp->vcp_nnics) {
1505 		log_warnx("vm \"%s\" no permission to set interfaces", name);
1506 		return (EPERM);
1507 	}
1508 	for (i = 0; i < vcpp->vcp_nnics; i++) {
1509 		/* Interface got overwritten */
1510 		if (i < vcp->vcp_nnics)
1511 			continue;
1512 
1513 		/* Copy interface from parent */
1514 		vmc->vmc_ifflags[i] = vmcp->vmc_ifflags[i];
1515 		(void)strlcpy(vmc->vmc_ifnames[i], vmcp->vmc_ifnames[i],
1516 		    sizeof(vmc->vmc_ifnames[i]));
1517 		(void)strlcpy(vmc->vmc_ifswitch[i], vmcp->vmc_ifswitch[i],
1518 		    sizeof(vmc->vmc_ifswitch[i]));
1519 		(void)strlcpy(vmc->vmc_ifgroup[i], vmcp->vmc_ifgroup[i],
1520 		    sizeof(vmc->vmc_ifgroup[i]));
1521 		memcpy(vcp->vcp_macs[i], vcpp->vcp_macs[i],
1522 		    sizeof(vcp->vcp_macs[i]));
1523 		vmc->vmc_ifrdomain[i] = vmcp->vmc_ifrdomain[i];
1524 		vcp->vcp_nnics++;
1525 	}
1526 	for (i = 0; i < vcp->vcp_nnics; i++) {
1527 		for (j = 0; j < vcpp->vcp_nnics; j++) {
1528 			if (memcmp(zero_mac, vcp->vcp_macs[i],
1529 			    sizeof(vcp->vcp_macs[i])) != 0 &&
1530 			    memcmp(vcpp->vcp_macs[i], vcp->vcp_macs[i],
1531 			    sizeof(vcp->vcp_macs[i])) != 0) {
1532 				log_warnx("vm \"%s\" lladdr cannot be reused",
1533 				    name);
1534 				return (EBUSY);
1535 			}
1536 			if (strlen(vmc->vmc_ifnames[i]) &&
1537 			    strcmp(vmc->vmc_ifnames[i],
1538 			    vmcp->vmc_ifnames[j]) == 0) {
1539 				log_warnx("vm \"%s\" %s cannot be reused",
1540 				    vmc->vmc_ifnames[i], name);
1541 				return (EBUSY);
1542 			}
1543 		}
1544 	}
1545 
1546 	/* kernel */
1547 	if (strlen(vcp->vcp_kernel) > 0) {
1548 		if (vm_checkinsflag(vmcp, VMOP_CREATE_KERNEL, uid) != 0) {
1549 			log_warnx("vm \"%s\" no permission to set boot image",
1550 			    name);
1551 			return (EPERM);
1552 		}
1553 		vmc->vmc_checkaccess |= VMOP_CREATE_KERNEL;
1554 	} else if (strlcpy(vcp->vcp_kernel, vcpp->vcp_kernel,
1555 	    sizeof(vcp->vcp_kernel)) >= sizeof(vcp->vcp_kernel)) {
1556 		log_warnx("vm \"%s\" kernel name too long", name);
1557 		return (EINVAL);
1558 	}
1559 
1560 	/* cdrom */
1561 	if (strlen(vcp->vcp_cdrom) > 0) {
1562 		if (vm_checkinsflag(vmcp, VMOP_CREATE_CDROM, uid) != 0) {
1563 			log_warnx("vm \"%s\" no permission to set cdrom", name);
1564 			return (EPERM);
1565 		}
1566 		vmc->vmc_checkaccess |= VMOP_CREATE_CDROM;
1567 	} else if (strlcpy(vcp->vcp_cdrom, vcpp->vcp_cdrom,
1568 	    sizeof(vcp->vcp_cdrom)) >= sizeof(vcp->vcp_cdrom)) {
1569 		log_warnx("vm \"%s\" cdrom name too long", name);
1570 		return (EINVAL);
1571 	}
1572 
1573 	/* user */
1574 	if (vmc->vmc_owner.uid == 0)
1575 		vmc->vmc_owner.uid = vmcp->vmc_owner.uid;
1576 	else if (vmc->vmc_owner.uid != uid &&
1577 	    vmc->vmc_owner.uid != vmcp->vmc_owner.uid) {
1578 		log_warnx("vm \"%s\" user mismatch", name);
1579 		return (EPERM);
1580 	}
1581 
1582 	/* group */
1583 	if (vmc->vmc_owner.gid == 0)
1584 		vmc->vmc_owner.gid = vmcp->vmc_owner.gid;
1585 	else if (vmc->vmc_owner.gid != vmcp->vmc_owner.gid) {
1586 		log_warnx("vm \"%s\" group mismatch", name);
1587 		return (EPERM);
1588 	}
1589 
1590 	/* child instances */
1591 	if (vmc->vmc_insflags) {
1592 		log_warnx("vm \"%s\" cannot change instance permissions", name);
1593 		return (EPERM);
1594 	}
1595 	if (vmcp->vmc_insflags & VMOP_CREATE_INSTANCE) {
1596 		vmc->vmc_insowner.gid = vmcp->vmc_insowner.gid;
1597 		vmc->vmc_insowner.uid = vmcp->vmc_insowner.gid;
1598 		vmc->vmc_insflags = vmcp->vmc_insflags;
1599 	} else {
1600 		vmc->vmc_insowner.gid = 0;
1601 		vmc->vmc_insowner.uid = 0;
1602 		vmc->vmc_insflags = 0;
1603 	}
1604 
1605 	/* finished, remove instance flags */
1606 	vmc->vmc_flags &= ~VMOP_CREATE_INSTANCE;
1607 
1608 	return (0);
1609 }
1610 
1611 /*
1612  * vm_checkperm
1613  *
1614  * Checks if the user represented by the 'uid' parameter is allowed to
1615  * manipulate the VM described by the 'vm' parameter (or connect to said VM's
1616  * console.)
1617  *
1618  * Parameters:
1619  *  vm: the VM whose permission is to be checked
1620  *  vmo: the required uid/gid to be checked
1621  *  uid: the user ID of the user making the request
1622  *
1623  * Return values:
1624  *   0: the permission should be granted
1625  *  -1: the permission check failed (also returned if vm == null)
1626  */
1627 int
1628 vm_checkperm(struct vmd_vm *vm, struct vmop_owner *vmo, uid_t uid)
1629 {
1630 	struct group	*gr;
1631 	struct passwd	*pw;
1632 	char		**grmem;
1633 
1634 	/* root has no restrictions */
1635 	if (uid == 0)
1636 		return (0);
1637 
1638 	if (vmo == NULL)
1639 		return (-1);
1640 
1641 	/* check user */
1642 	if (vm == NULL) {
1643 		if  (vmo->uid == uid)
1644 			return (0);
1645 	} else {
1646 		/*
1647 		 * check user of running vm (the owner of a running vm can
1648 		 * be different to (or more specific than) the configured owner.
1649 		 */
1650 		if (((vm->vm_state & VM_STATE_RUNNING) && vm->vm_uid == uid) ||
1651 		    (!(vm->vm_state & VM_STATE_RUNNING) && vmo->uid == uid))
1652 			return (0);
1653 	}
1654 
1655 	/* check groups */
1656 	if (vmo->gid != -1) {
1657 		if ((pw = getpwuid(uid)) == NULL)
1658 			return (-1);
1659 		if (pw->pw_gid == vmo->gid)
1660 			return (0);
1661 		if ((gr = getgrgid(vmo->gid)) != NULL) {
1662 			for (grmem = gr->gr_mem; *grmem; grmem++)
1663 				if (strcmp(*grmem, pw->pw_name) == 0)
1664 					return (0);
1665 		}
1666 	}
1667 
1668 	return (-1);
1669 }
1670 
1671 /*
1672  * vm_checkinsflag
1673  *
1674  * Checks whether the non-root user is allowed to set an instance option.
1675  *
1676  * Parameters:
1677  *  vmc: the VM create parameters
1678  *  flag: the flag to be checked
1679  *  uid: the user ID of the user making the request
1680  *
1681  * Return values:
1682  *   0: the permission should be granted
1683  *  -1: the permission check failed (also returned if vm == null)
1684  */
1685 int
1686 vm_checkinsflag(struct vmop_create_params *vmc, unsigned int flag, uid_t uid)
1687 {
1688 	/* root has no restrictions */
1689 	if (uid == 0)
1690 		return (0);
1691 
1692 	if ((vmc->vmc_insflags & flag) == 0)
1693 		return (-1);
1694 
1695 	return (0);
1696 }
1697 
1698 /*
1699  * vm_checkaccess
1700  *
1701  * Checks if the user represented by the 'uid' parameter is allowed to
1702  * access the file described by the 'path' parameter.
1703  *
1704  * Parameters:
1705  *  fd: the file descriptor of the opened file
1706  *  uflag: check if the userid has access to the file
1707  *  uid: the user ID of the user making the request
1708  *  amode: the access flags of R_OK and W_OK
1709  *
1710  * Return values:
1711  *   0: the permission should be granted
1712  *  -1: the permission check failed
1713  */
1714 int
1715 vm_checkaccess(int fd, unsigned int uflag, uid_t uid, int amode)
1716 {
1717 	struct group	*gr;
1718 	struct passwd	*pw;
1719 	char		**grmem;
1720 	struct stat	 st;
1721 	mode_t		 mode;
1722 
1723 	if (fd == -1)
1724 		return (-1);
1725 
1726 	/*
1727 	 * File has to be accessible and a regular file
1728 	 */
1729 	if (fstat(fd, &st) == -1 || !S_ISREG(st.st_mode))
1730 		return (-1);
1731 
1732 	/* root has no restrictions */
1733 	if (uid == 0 || uflag == 0)
1734 		return (0);
1735 
1736 	/* check other */
1737 	mode = amode & W_OK ? S_IWOTH : 0;
1738 	mode |= amode & R_OK ? S_IROTH : 0;
1739 	if ((st.st_mode & mode) == mode)
1740 		return (0);
1741 
1742 	/* check user */
1743 	mode = amode & W_OK ? S_IWUSR : 0;
1744 	mode |= amode & R_OK ? S_IRUSR : 0;
1745 	if (uid == st.st_uid && (st.st_mode & mode) == mode)
1746 		return (0);
1747 
1748 	/* check groups */
1749 	mode = amode & W_OK ? S_IWGRP : 0;
1750 	mode |= amode & R_OK ? S_IRGRP : 0;
1751 	if ((st.st_mode & mode) != mode)
1752 		return (-1);
1753 	if ((pw = getpwuid(uid)) == NULL)
1754 		return (-1);
1755 	if (pw->pw_gid == st.st_gid)
1756 		return (0);
1757 	if ((gr = getgrgid(st.st_gid)) != NULL) {
1758 		for (grmem = gr->gr_mem; *grmem; grmem++)
1759 			if (strcmp(*grmem, pw->pw_name) == 0)
1760 				return (0);
1761 	}
1762 
1763 	return (-1);
1764 }
1765 
/*
 * vm_opentty
 *
 * Allocates a console pty pair for the VM via the pre-opened PTM
 * device, enables user ioctl(2) mode for passing break commands, and
 * sets ownership and permissions on the tty (loosely based on
 * sshpty.c).  On success the controller fd is stored in vm->vm_tty
 * and the tty path in vm->vm_ttyname.
 *
 * Return values:
 *   0: success
 *  -1: failure; any partially set up tty state is closed again
 */
int
vm_opentty(struct vmd_vm *vm)
{
	struct ptmget		 ptm;
	struct stat		 st;
	struct group		*gr;
	uid_t			 uid;
	gid_t			 gid;
	mode_t			 mode;
	int			 on;

	/*
	 * Open tty with pre-opened PTM fd
	 */
	if ((ioctl(env->vmd_ptmfd, PTMGET, &ptm) == -1))
		return (-1);

	/*
	 * We use user ioctl(2) mode to pass break commands.
	 */
	on = 1;
	if (ioctl(ptm.cfd, TIOCUCNTL, &on) == -1)
		fatal("could not enable user ioctl mode");

	/* Keep the controller fd; close the server side of the pair. */
	vm->vm_tty = ptm.cfd;
	close(ptm.sfd);
	if ((vm->vm_ttyname = strdup(ptm.sn)) == NULL)
		goto fail;

	uid = vm->vm_uid;
	gid = vm->vm_params.vmc_owner.gid;

	/* Pick group/mode: owner group, the "tty" group, or root only. */
	if (vm->vm_params.vmc_owner.gid != -1) {
		mode = 0660;
	} else if ((gr = getgrnam("tty")) != NULL) {
		gid = gr->gr_gid;
		mode = 0620;
	} else {
		mode = 0600;
		gid = 0;
	}

	log_debug("%s: vm %s tty %s uid %d gid %d mode %o",
	    __func__, vm->vm_params.vmc_params.vcp_name,
	    vm->vm_ttyname, uid, gid, mode);

	/*
	 * Change ownership and mode of the tty as required.
	 * Loosely based on the implementation of sshpty.c
	 */
	if (stat(vm->vm_ttyname, &st) == -1)
		goto fail;

	if (st.st_uid != uid || st.st_gid != gid) {
		if (chown(vm->vm_ttyname, uid, gid) == -1) {
			log_warn("chown %s %d %d failed, uid %d",
			    vm->vm_ttyname, uid, gid, getuid());

			/* Ignore failure on read-only filesystems */
			if (!((errno == EROFS) &&
			    (st.st_uid == uid || st.st_uid == 0)))
				goto fail;
		}
	}

	if ((st.st_mode & (S_IRWXU|S_IRWXG|S_IRWXO)) != mode) {
		if (chmod(vm->vm_ttyname, mode) == -1) {
			log_warn("chmod %s %o failed, uid %d",
			    vm->vm_ttyname, mode, getuid());

			/* Ignore failure on read-only filesystems */
			if (!((errno == EROFS) &&
			    (st.st_uid == uid || st.st_uid == 0)))
				goto fail;
		}
	}

	return (0);
 fail:
	vm_closetty(vm);
	return (-1);
}
1848 
1849 void
1850 vm_closetty(struct vmd_vm *vm)
1851 {
1852 	if (vm->vm_tty != -1) {
1853 		/* Release and close the tty */
1854 		if (fchown(vm->vm_tty, 0, 0) == -1)
1855 			log_warn("chown %s 0 0 failed", vm->vm_ttyname);
1856 		if (fchmod(vm->vm_tty, 0666) == -1)
1857 			log_warn("chmod %s 0666 failed", vm->vm_ttyname);
1858 		close(vm->vm_tty);
1859 		vm->vm_tty = -1;
1860 	}
1861 	free(vm->vm_ttyname);
1862 	vm->vm_ttyname = NULL;
1863 }
1864 
1865 void
1866 switch_remove(struct vmd_switch *vsw)
1867 {
1868 	if (vsw == NULL)
1869 		return;
1870 
1871 	TAILQ_REMOVE(env->vmd_switches, vsw, sw_entry);
1872 
1873 	free(vsw->sw_group);
1874 	free(vsw->sw_name);
1875 	free(vsw);
1876 }
1877 
1878 struct vmd_switch *
1879 switch_getbyname(const char *name)
1880 {
1881 	struct vmd_switch	*vsw;
1882 
1883 	if (name == NULL)
1884 		return (NULL);
1885 	TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
1886 		if (strcmp(vsw->sw_name, name) == 0)
1887 			return (vsw);
1888 	}
1889 
1890 	return (NULL);
1891 }
1892 
1893 char *
1894 get_string(uint8_t *ptr, size_t len)
1895 {
1896 	size_t	 i;
1897 
1898 	for (i = 0; i < len; i++)
1899 		if (!isprint((unsigned char)ptr[i]))
1900 			break;
1901 
1902 	return strndup(ptr, i);
1903 }
1904 
/*
 * prefixlen2mask
 *
 * Converts an IPv4 prefix length (clamped to 32) into a netmask in
 * network byte order.
 */
uint32_t
prefixlen2mask(uint8_t prefixlen)
{
	uint32_t mask;

	/* An empty prefix yields an empty mask. */
	if (prefixlen == 0)
		return (0);

	/* Clamp to the width of an IPv4 address. */
	if (prefixlen > 32)
		prefixlen = 32;

	mask = 0xffffffff << (32 - prefixlen);
	return (htonl(mask));
}
1916 
/*
 * prefixlen2mask6
 *
 * Converts an IPv6 prefix length (clamped to 128) into a netmask,
 * written into 'mask'.
 */
void
prefixlen2mask6(uint8_t prefixlen, struct in6_addr *mask)
{
	struct in6_addr	 tmp;
	int		 whole, rem;

	if (prefixlen > 128)
		prefixlen = 128;

	whole = prefixlen / 8;
	rem = prefixlen % 8;

	memset(&tmp, 0, sizeof(tmp));
	memset(tmp.s6_addr, 0xff, whole);
	if (rem)
		tmp.s6_addr[whole] = 0xff00 >> rem;

	memcpy(mask, &tmp, sizeof(tmp));
}
1935 
1936 void
1937 getmonotime(struct timeval *tv)
1938 {
1939 	struct timespec	 ts;
1940 
1941 	if (clock_gettime(CLOCK_MONOTONIC, &ts))
1942 		fatal("clock_gettime");
1943 
1944 	TIMESPEC_TO_TIMEVAL(tv, &ts);
1945 }
1946 
1947 static inline void
1948 vm_terminate(struct vmd_vm *vm, const char *caller)
1949 {
1950 	if (vm->vm_from_config)
1951 		vm_stop(vm, 0, caller);
1952 	else {
1953 		/* vm_remove calls vm_stop */
1954 		vm_remove(vm, caller);
1955 	}
1956 }
1957 
1958