xref: /netbsd-src/sys/kern/kern_exec.c (revision fb5eed702691094bd687fbf1ded189c87457cd35)
1 /*	$NetBSD: kern_exec.c,v 1.511 2021/11/07 13:47:49 christos Exp $	*/
2 
3 /*-
4  * Copyright (c) 2008, 2019, 2020 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*-
33  * Copyright (C) 1993, 1994, 1996 Christopher G. Demetriou
34  * Copyright (C) 1992 Wolfgang Solfrank.
35  * Copyright (C) 1992 TooLs GmbH.
36  * All rights reserved.
37  *
38  * Redistribution and use in source and binary forms, with or without
39  * modification, are permitted provided that the following conditions
40  * are met:
41  * 1. Redistributions of source code must retain the above copyright
42  *    notice, this list of conditions and the following disclaimer.
43  * 2. Redistributions in binary form must reproduce the above copyright
44  *    notice, this list of conditions and the following disclaimer in the
45  *    documentation and/or other materials provided with the distribution.
46  * 3. All advertising materials mentioning features or use of this software
47  *    must display the following acknowledgement:
48  *	This product includes software developed by TooLs GmbH.
49  * 4. The name of TooLs GmbH may not be used to endorse or promote products
50  *    derived from this software without specific prior written permission.
51  *
52  * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
53  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
54  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
55  * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
56  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
57  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
58  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
59  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
60  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
61  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
62  */
63 
64 #include <sys/cdefs.h>
65 __KERNEL_RCSID(0, "$NetBSD: kern_exec.c,v 1.511 2021/11/07 13:47:49 christos Exp $");
66 
67 #include "opt_exec.h"
68 #include "opt_execfmt.h"
69 #include "opt_ktrace.h"
70 #include "opt_modular.h"
71 #include "opt_syscall_debug.h"
72 #include "veriexec.h"
73 #include "opt_pax.h"
74 
75 #include <sys/param.h>
76 #include <sys/systm.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/proc.h>
80 #include <sys/ptrace.h>
81 #include <sys/mount.h>
82 #include <sys/kmem.h>
83 #include <sys/namei.h>
84 #include <sys/vnode.h>
85 #include <sys/file.h>
87 #include <sys/acct.h>
88 #include <sys/atomic.h>
89 #include <sys/exec.h>
90 #include <sys/futex.h>
91 #include <sys/ktrace.h>
92 #include <sys/uidinfo.h>
93 #include <sys/wait.h>
94 #include <sys/mman.h>
95 #include <sys/ras.h>
96 #include <sys/signalvar.h>
97 #include <sys/stat.h>
98 #include <sys/syscall.h>
99 #include <sys/kauth.h>
100 #include <sys/lwpctl.h>
101 #include <sys/pax.h>
102 #include <sys/cpu.h>
103 #include <sys/module.h>
104 #include <sys/syscallvar.h>
105 #include <sys/syscallargs.h>
106 #include <sys/vfs_syscalls.h>
107 #if NVERIEXEC > 0
108 #include <sys/verified_exec.h>
109 #endif /* NVERIEXEC > 0 */
110 #include <sys/sdt.h>
111 #include <sys/spawn.h>
112 #include <sys/prot.h>
113 #include <sys/cprng.h>
114 
115 #include <uvm/uvm_extern.h>
116 
117 #include <machine/reg.h>
118 
119 #include <compat/common/compat_util.h>
120 
121 #ifndef MD_TOPDOWN_INIT
122 #ifdef __USE_TOPDOWN_VM
123 #define	MD_TOPDOWN_INIT(epp)	(epp)->ep_flags |= EXEC_TOPDOWN_VM
124 #else
125 #define	MD_TOPDOWN_INIT(epp)
126 #endif
127 #endif
128 
129 struct execve_data;
130 
131 extern int user_va0_disable;
132 
133 static size_t calcargs(struct execve_data * restrict, const size_t);
134 static size_t calcstack(struct execve_data * restrict, const size_t);
135 static int copyoutargs(struct execve_data * restrict, struct lwp *,
136     char * const);
137 static int copyoutpsstrs(struct execve_data * restrict, struct proc *);
138 static int copyinargs(struct execve_data * restrict, char * const *,
139     char * const *, execve_fetch_element_t, char **);
140 static int copyinargstrs(struct execve_data * restrict, char * const *,
141     execve_fetch_element_t, char **, size_t *, void (*)(const void *, size_t));
142 static int exec_sigcode_map(struct proc *, const struct emul *);
143 
144 #if defined(DEBUG) && !defined(DEBUG_EXEC)
145 #define DEBUG_EXEC
146 #endif
147 #ifdef DEBUG_EXEC
148 #define DPRINTF(a) printf a
149 #define COPYPRINTF(s, a, b) printf("%s, %d: copyout%s @%p %zu\n", __func__, \
150     __LINE__, (s), (a), (b))
151 static void dump_vmcmds(const struct exec_package * const, size_t, int);
152 #define DUMPVMCMDS(p, x, e) do { dump_vmcmds((p), (x), (e)); } while (0)
153 #else
154 #define DPRINTF(a)
155 #define COPYPRINTF(s, a, b)
156 #define DUMPVMCMDS(p, x, e) do {} while (0)
157 #endif /* DEBUG_EXEC */
158 
159 /*
160  * DTrace SDT provider definitions
161  */
162 SDT_PROVIDER_DECLARE(proc);
163 SDT_PROBE_DEFINE1(proc, kernel, , exec, "char *");
164 SDT_PROBE_DEFINE1(proc, kernel, , exec__success, "char *");
165 SDT_PROBE_DEFINE1(proc, kernel, , exec__failure, "int");
166 
167 /*
168  * Exec function switch:
169  *
170  * Note that each makecmds function is responsible for loading the
171  * exec package with the necessary functions for any exec-type-specific
172  * handling.
173  *
174  * Functions for specific exec types should be defined in their own
175  * header file.
176  */
177 static const struct execsw	**execsw = NULL;
178 static int			nexecs;
179 
180 u_int	exec_maxhdrsz;	 /* must not be static - used by netbsd32 */
181 
182 /* list of dynamically loaded execsw entries */
183 static LIST_HEAD(execlist_head, exec_entry) ex_head =
184     LIST_HEAD_INITIALIZER(ex_head);
185 struct exec_entry {
186 	LIST_ENTRY(exec_entry)	ex_list;
187 	SLIST_ENTRY(exec_entry)	ex_slist;
188 	const struct execsw	*ex_sw;
189 };
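/*
 * Illustrative only: roughly what a format loader contributes through
 * the exec switch described above.  The field subset and the makecmds
 * function here are assumptions for the sake of the sketch; see
 * sys/exec.h for the full struct execsw and the exec_elf/exec_script
 * code for real entries.
 */
#if 0
static struct execsw example_execsw = {
	.es_hdrsz = sizeof(int),		/* header bytes to read */
	.es_makecmds = exec_example_makecmds,	/* hypothetical: parse the
						 * header, fill ep_vmcmds */
	.es_emul = &emul_netbsd,		/* run under native emulation */
	.es_prio = EXECSW_PRIO_ANY,		/* no ordering preference */
};
#endif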
190 
191 #ifndef __HAVE_SYSCALL_INTERN
192 void	syscall(void);
193 #endif
194 
195 /* NetBSD autoloadable syscalls */
196 #ifdef MODULAR
197 #include <kern/syscalls_autoload.c>
198 #endif
199 
200 /* NetBSD emul struct */
201 struct emul emul_netbsd = {
202 	.e_name =		"netbsd",
203 #ifdef EMUL_NATIVEROOT
204 	.e_path =		EMUL_NATIVEROOT,
205 #else
206 	.e_path =		NULL,
207 #endif
208 #ifndef __HAVE_MINIMAL_EMUL
209 	.e_flags =		EMUL_HAS_SYS___syscall,
210 	.e_errno =		NULL,
211 	.e_nosys =		SYS_syscall,
212 	.e_nsysent =		SYS_NSYSENT,
213 #endif
214 #ifdef MODULAR
215 	.e_sc_autoload =	netbsd_syscalls_autoload,
216 #endif
217 	.e_sysent =		sysent,
218 	.e_nomodbits =		sysent_nomodbits,
219 #ifdef SYSCALL_DEBUG
220 	.e_syscallnames =	syscallnames,
221 #else
222 	.e_syscallnames =	NULL,
223 #endif
224 	.e_sendsig =		sendsig,
225 	.e_trapsignal =		trapsignal,
226 	.e_sigcode =		NULL,
227 	.e_esigcode =		NULL,
228 	.e_sigobject =		NULL,
229 	.e_setregs =		setregs,
230 	.e_proc_exec =		NULL,
231 	.e_proc_fork =		NULL,
232 	.e_proc_exit =		NULL,
233 	.e_lwp_fork =		NULL,
234 	.e_lwp_exit =		NULL,
235 #ifdef __HAVE_SYSCALL_INTERN
236 	.e_syscall_intern =	syscall_intern,
237 #else
238 	.e_syscall =		syscall,
239 #endif
240 	.e_sysctlovly =		NULL,
241 	.e_vm_default_addr =	uvm_default_mapaddr,
242 	.e_usertrap =		NULL,
243 	.e_ucsize =		sizeof(ucontext_t),
244 	.e_startlwp =		startlwp
245 };
246 
247 /*
248  * Exec lock. Used to control access to execsw[] structures.
249  * This must not be static so that netbsd32 can access it, too.
250  */
251 krwlock_t exec_lock __cacheline_aligned;
252 
253 static kmutex_t sigobject_lock __cacheline_aligned;
254 
255 /*
256  * Data used between a loadvm and execve part of an "exec" operation
257  */
258 struct execve_data {
259 	struct exec_package	ed_pack;
260 	struct pathbuf		*ed_pathbuf;
261 	struct vattr		ed_attr;
262 	struct ps_strings	ed_arginfo;
263 	char			*ed_argp;
264 	const char		*ed_pathstring;
265 	char			*ed_resolvedname;
266 	size_t			ed_ps_strings_sz;
267 	int			ed_szsigcode;
268 	size_t			ed_argslen;
269 	long			ed_argc;
270 	long			ed_envc;
271 };
272 
273 /*
274  * data passed from parent lwp to child during a posix_spawn()
275  */
276 struct spawn_exec_data {
277 	struct execve_data	sed_exec;
278 	struct posix_spawn_file_actions
279 				*sed_actions;
280 	struct posix_spawnattr	*sed_attrs;
281 	struct proc		*sed_parent;
282 	kcondvar_t		sed_cv_child_ready;
283 	kmutex_t		sed_mtx_child;
284 	int			sed_error;
285 	volatile uint32_t	sed_refcnt;
286 };
287 
288 static struct vm_map *exec_map;
289 static struct pool exec_pool;
290 
291 static void *
292 exec_pool_alloc(struct pool *pp, int flags)
293 {
294 
295 	return (void *)uvm_km_alloc(exec_map, NCARGS, 0,
296 	    UVM_KMF_PAGEABLE | UVM_KMF_WAITVA);
297 }
298 
299 static void
300 exec_pool_free(struct pool *pp, void *addr)
301 {
302 
303 	uvm_km_free(exec_map, (vaddr_t)addr, NCARGS, UVM_KMF_PAGEABLE);
304 }
305 
306 static struct pool_allocator exec_palloc = {
307 	.pa_alloc = exec_pool_alloc,
308 	.pa_free = exec_pool_free,
309 	.pa_pagesz = NCARGS
310 };
311 
312 static void
313 exec_path_free(struct execve_data *data)
314 {
315 	pathbuf_stringcopy_put(data->ed_pathbuf, data->ed_pathstring);
316 	pathbuf_destroy(data->ed_pathbuf);
317 	if (data->ed_resolvedname)
318 		PNBUF_PUT(data->ed_resolvedname);
319 }
320 
321 static int
322 exec_resolvename(struct lwp *l, struct exec_package *epp, struct vnode *vp,
323     char **rpath)
324 {
325 	int error;
326 	char *p;
327 
328 	KASSERT(rpath != NULL);
329 
330 	*rpath = PNBUF_GET();
331 	error = vnode_to_path(*rpath, MAXPATHLEN, vp, l, l->l_proc);
332 	if (error) {
333 		DPRINTF(("%s: can't resolve name for %s, error %d\n",
334 		    __func__, epp->ep_kname, error));
335 		PNBUF_PUT(*rpath);
336 		*rpath = NULL;
337 		return error;
338 	}
339 	epp->ep_resolvedname = *rpath;
340 	if ((p = strrchr(*rpath, '/')) != NULL)
341 		epp->ep_kname = p + 1;
342 	return 0;
343 }
344 
345 
346 /*
347  * check exec:
348  * given an "executable" described in the exec package's namei info,
349  * see what we can do with it.
350  *
351  * ON ENTRY:
352  *	exec package with appropriate namei info
353  *	lwp pointer of exec'ing lwp
354  *	NO SELF-LOCKED VNODES
355  *
356  * ON EXIT:
357  *	error:	nothing held, etc.  exec header still allocated.
358  *	ok:	filled exec package, executable's vnode (unlocked).
359  *
360  * EXEC SWITCH ENTRY:
361  * 	Locked vnode to check, exec package, proc.
362  *
363  * EXEC SWITCH EXIT:
364  *	ok:	return 0, filled exec package, executable's vnode (unlocked).
365  *	error:	destructive:
366  *			everything deallocated except the exec header.
367  *		non-destructive:
368  *			error code, executable's vnode (unlocked),
369  *			exec header unmodified.
370  */
371 int
372 /*ARGSUSED*/
373 check_exec(struct lwp *l, struct exec_package *epp, struct pathbuf *pb,
374     char **rpath)
375 {
376 	int		error, i;
377 	struct vnode	*vp;
378 	size_t		resid;
379 
380 	if (epp->ep_resolvedname) {
381 		struct nameidata nd;
382 
383 		// grab the absolute pathbuf here before namei() trashes it.
384 		pathbuf_copystring(pb, epp->ep_resolvedname, PATH_MAX);
385 		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
386 
387 		/* first get the vnode */
388 		if ((error = namei(&nd)) != 0)
389 			return error;
390 
391 		epp->ep_vp = vp = nd.ni_vp;
392 #ifdef DIAGNOSTIC
393 		/* paranoia (take this out once namei stuff stabilizes) */
394 		memset(nd.ni_pnbuf, '~', PATH_MAX);
395 #endif
396 	} else {
397 		struct file *fp;
398 
399 		if ((error = fd_getvnode(epp->ep_xfd, &fp)) != 0)
400 			return error;
401 		epp->ep_vp = vp = fp->f_vnode;
402 		vref(vp);
403 		fd_putfile(epp->ep_xfd);
404 		if ((error = exec_resolvename(l, epp, vp, rpath)) != 0)
405 			return error;
406 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
407 	}
408 
409 	/* check access and type */
410 	if (vp->v_type != VREG) {
411 		error = EACCES;
412 		goto bad1;
413 	}
414 	if ((error = VOP_ACCESS(vp, VEXEC, l->l_cred)) != 0)
415 		goto bad1;
416 
417 	/* get attributes */
418 	/* XXX VOP_GETATTR is the only thing that needs LK_EXCLUSIVE here */
419 	if ((error = VOP_GETATTR(vp, epp->ep_vap, l->l_cred)) != 0)
420 		goto bad1;
421 
422 	/* Check mount point */
423 	if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
424 		error = EACCES;
425 		goto bad1;
426 	}
427 	if (vp->v_mount->mnt_flag & MNT_NOSUID)
428 		epp->ep_vap->va_mode &= ~(S_ISUID | S_ISGID);
429 
430 	/* try to open it */
431 	if ((error = VOP_OPEN(vp, FREAD, l->l_cred)) != 0)
432 		goto bad1;
433 
434 	/* now we have the file, get the exec header */
435 	error = vn_rdwr(UIO_READ, vp, epp->ep_hdr, epp->ep_hdrlen, 0,
436 			UIO_SYSSPACE, IO_NODELOCKED, l->l_cred, &resid, NULL);
437 	if (error)
438 		goto bad1;
439 
440 	/* unlock vp, since we need it unlocked from here on out. */
441 	VOP_UNLOCK(vp);
442 
443 #if NVERIEXEC > 0
444 	error = veriexec_verify(l, vp,
445 	    epp->ep_resolvedname ? epp->ep_resolvedname : epp->ep_kname,
446 	    epp->ep_flags & EXEC_INDIR ? VERIEXEC_INDIRECT : VERIEXEC_DIRECT,
447 	    NULL);
448 	if (error)
449 		goto bad2;
450 #endif /* NVERIEXEC > 0 */
451 
452 #ifdef PAX_SEGVGUARD
453 	error = pax_segvguard(l, vp, epp->ep_resolvedname, false);
454 	if (error)
455 		goto bad2;
456 #endif /* PAX_SEGVGUARD */
457 
458 	epp->ep_hdrvalid = epp->ep_hdrlen - resid;
459 
460 	/*
461 	 * Set up default address space limits.  Can be overridden
462 	 * by individual exec packages.
463 	 */
464 	epp->ep_vm_minaddr = exec_vm_minaddr(VM_MIN_ADDRESS);
465 	epp->ep_vm_maxaddr = VM_MAXUSER_ADDRESS;
466 
467 	/*
468 	 * set up the vmcmds for creation of the process
469 	 * address space
470 	 */
471 	error = ENOEXEC;
472 	for (i = 0; i < nexecs; i++) {
473 		int newerror;
474 
475 		epp->ep_esch = execsw[i];
476 		newerror = (*execsw[i]->es_makecmds)(l, epp);
477 
478 		if (!newerror) {
479 			/* Seems ok: check that entry point is not too high */
480 			if (epp->ep_entry >= epp->ep_vm_maxaddr) {
481 #ifdef DIAGNOSTIC
482 				printf("%s: rejecting %p due to "
483 				    "too high entry address (>= %p)\n",
484 					 __func__, (void *)epp->ep_entry,
485 					 (void *)epp->ep_vm_maxaddr);
486 #endif
487 				error = ENOEXEC;
488 				break;
489 			}
490 			/* Seems ok: check that entry point is not too low */
491 			if (epp->ep_entry < epp->ep_vm_minaddr) {
492 #ifdef DIAGNOSTIC
493 				printf("%s: rejecting %p due to "
494 				    "too low entry address (< %p)\n",
495 				     __func__, (void *)epp->ep_entry,
496 				     (void *)epp->ep_vm_minaddr);
497 #endif
498 				error = ENOEXEC;
499 				break;
500 			}
501 
502 			/* check limits */
503 #ifdef DIAGNOSTIC
504 #define LMSG "%s: rejecting due to %s limit (%ju > %ju)\n"
505 #endif
506 #ifdef MAXTSIZ
507 			if (epp->ep_tsize > MAXTSIZ) {
508 #ifdef DIAGNOSTIC
509 				printf(LMSG, __func__, "text",
510 				    (uintmax_t)epp->ep_tsize,
511 				    (uintmax_t)MAXTSIZ);
512 #endif
513 				error = ENOMEM;
514 				break;
515 			}
516 #endif
517 			vsize_t dlimit =
518 			    (vsize_t)l->l_proc->p_rlimit[RLIMIT_DATA].rlim_cur;
519 			if (epp->ep_dsize > dlimit) {
520 #ifdef DIAGNOSTIC
521 				printf(LMSG, __func__, "data",
522 				    (uintmax_t)epp->ep_dsize,
523 				    (uintmax_t)dlimit);
524 #endif
525 				error = ENOMEM;
526 				break;
527 			}
528 			return 0;
529 		}
530 
531 		/*
532 		 * Reset all the fields that may have been modified by the
533 		 * loader.
534 		 */
535 		KASSERT(epp->ep_emul_arg == NULL);
536 		if (epp->ep_emul_root != NULL) {
537 			vrele(epp->ep_emul_root);
538 			epp->ep_emul_root = NULL;
539 		}
540 		if (epp->ep_interp != NULL) {
541 			vrele(epp->ep_interp);
542 			epp->ep_interp = NULL;
543 		}
544 		epp->ep_pax_flags = 0;
545 
546 		/* make sure the first "interesting" error code is saved. */
547 		if (error == ENOEXEC)
548 			error = newerror;
549 
550 		if (epp->ep_flags & EXEC_DESTR)
551 			/* Error from "#!" code, tidied up by recursive call */
552 			return error;
553 	}
554 
555 	/* not found, error */
556 
557 	/*
558 	 * free any vmspace-creation commands,
559 	 * and release their references
560 	 */
561 	kill_vmcmds(&epp->ep_vmcmds);
562 
563 #if NVERIEXEC > 0 || defined(PAX_SEGVGUARD)
564 bad2:
565 #endif
566 	/*
567 	 * close and release the vnode, restore the old one, free the
568 	 * pathname buf, and punt.
569 	 */
570 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
571 	VOP_CLOSE(vp, FREAD, l->l_cred);
572 	vput(vp);
573 	return error;
574 
575 bad1:
576 	/*
577 	 * free the namei pathname buffer, and put the vnode
578 	 * (which we don't yet have open).
579 	 */
580 	vput(vp);				/* was still locked */
581 	return error;
582 }
583 
584 #ifdef __MACHINE_STACK_GROWS_UP
585 #define STACK_PTHREADSPACE NBPG
586 #else
587 #define STACK_PTHREADSPACE 0
588 #endif
589 
590 static int
591 execve_fetch_element(char * const *array, size_t index, char **value)
592 {
593 	return copyin(array + index, value, sizeof(*value));
594 }
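/*
 * Fetching argv/envp elements is indirected through fetch_element so
 * that a compat layer can substitute the user pointer width.  As a
 * sketch (an assumption for illustration, not the actual netbsd32
 * code), a 32-bit variant would read a 32-bit pointer and widen it:
 */
#if 0
static int
execve_fetch_element32(char * const *array, size_t index, char **value)
{
	uint32_t p32;
	int error;

	/* the user array holds 32-bit pointers */
	error = copyin((const uint32_t *)array + index, &p32, sizeof(p32));
	if (error == 0)
		*value = (char *)(uintptr_t)p32;
	return error;
}
#endif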
595 
596 /*
597  * exec system call
598  */
599 int
600 sys_execve(struct lwp *l, const struct sys_execve_args *uap, register_t *retval)
601 {
602 	/* {
603 		syscallarg(const char *)	path;
604 		syscallarg(char * const *)	argp;
605 		syscallarg(char * const *)	envp;
606 	} */
607 
608 	return execve1(l, true, SCARG(uap, path), -1, SCARG(uap, argp),
609 	    SCARG(uap, envp), execve_fetch_element);
610 }
611 
612 int
613 sys_fexecve(struct lwp *l, const struct sys_fexecve_args *uap,
614     register_t *retval)
615 {
616 	/* {
617 		syscallarg(int)			fd;
618 		syscallarg(char * const *)	argp;
619 		syscallarg(char * const *)	envp;
620 	} */
621 
622 	return execve1(l, false, NULL, SCARG(uap, fd), SCARG(uap, argp),
623 	    SCARG(uap, envp), execve_fetch_element);
624 }
625 
626 /*
627  * Load modules to try to execute an image that we do not understand.
628  * If no execsw entries are present, we load those likely to be needed
629  * in order to run native images only.  Otherwise, we autoload all
630  * possible modules that could let us run the binary.  XXX lame
631  */
632 static void
633 exec_autoload(void)
634 {
635 #ifdef MODULAR
636 	static const char * const native[] = {
637 		"exec_elf32",
638 		"exec_elf64",
639 		"exec_script",
640 		NULL
641 	};
642 	static const char * const compat[] = {
643 		"exec_elf32",
644 		"exec_elf64",
645 		"exec_script",
646 		"exec_aout",
647 		"exec_coff",
648 		"exec_ecoff",
649 		"compat_aoutm68k",
650 		"compat_netbsd32",
651 #if 0
652 		"compat_linux",
653 		"compat_linux32",
654 #endif
655 		"compat_sunos",
656 		"compat_sunos32",
657 		"compat_ultrix",
658 		NULL
659 	};
660 	char const * const *list;
661 	int i;
662 
663 	list = nexecs == 0 ? native : compat;
664 	for (i = 0; list[i] != NULL; i++) {
665 		if (module_autoload(list[i], MODULE_CLASS_EXEC) != 0) {
666 			continue;
667 		}
668 		yield();
669 	}
670 #endif
671 }
672 
673 /*
674  * Copy the user or kernel supplied upath to the allocated pathbuf pbp,
675  * making it absolute in the process by prepending the current working
676  * directory if it is not already absolute.  If offs is supplied, it will
677  * contain the offset where the original supplied copy of upath starts.
678  */
679 int
680 exec_makepathbuf(struct lwp *l, const char *upath, enum uio_seg seg,
681     struct pathbuf **pbp, size_t *offs)
682 {
683 	char *path, *bp;
684 	size_t len, tlen;
685 	int error;
686 	struct cwdinfo *cwdi;
687 
688 	path = PNBUF_GET();
689 	if (seg == UIO_SYSSPACE) {
690 		error = copystr(upath, path, MAXPATHLEN, &len);
691 	} else {
692 		error = copyinstr(upath, path, MAXPATHLEN, &len);
693 	}
694 	if (error)
695 		goto err;
696 
697 	if (path[0] == '/') {
698 		if (offs)
699 			*offs = 0;
700 		goto out;
701 	}
702 
703 	len++;
704 	if (len + 1 >= MAXPATHLEN) {
705 		error = ENAMETOOLONG;
706 		goto err;
707 	}
708 	bp = path + MAXPATHLEN - len;
709 	memmove(bp, path, len);
710 	*(--bp) = '/';
711 
712 	cwdi = l->l_proc->p_cwdi;
713 	rw_enter(&cwdi->cwdi_lock, RW_READER);
714 	error = getcwd_common(cwdi->cwdi_cdir, NULL, &bp, path, MAXPATHLEN / 2,
715 	    GETCWD_CHECK_ACCESS, l);
716 	rw_exit(&cwdi->cwdi_lock);
717 
718 	if (error)
719 		goto err;
720 	tlen = path + MAXPATHLEN - bp;
721 
722 	memmove(path, bp, tlen);
723 	path[tlen - 1] = '\0';
724 	if (offs)
725 		*offs = tlen - len;
726 out:
727 	*pbp = pathbuf_assimilate(path);
728 	return 0;
729 err:
730 	PNBUF_PUT(path);
731 	return error;
732 }
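/*
 * Worked example (illustrative): with a current working directory of
 * "/usr/bin", upath "vi" yields the pathbuf string "/usr/bin/vi" and
 * *offs == 9, the index at which the caller-supplied "vi" begins in
 * the absolute path.  An already-absolute upath is returned unchanged
 * with *offs == 0.
 */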
733 
734 vaddr_t
735 exec_vm_minaddr(vaddr_t va_min)
736 {
737 	/*
738 	 * Increase va_min if we don't want NULL to be mappable by the
739 	 * process.
740 	 */
741 #define VM_MIN_GUARD	PAGE_SIZE
742 	if (user_va0_disable && (va_min < VM_MIN_GUARD))
743 		return VM_MIN_GUARD;
744 	return va_min;
745 }
746 
747 static int
748 execve_loadvm(struct lwp *l, bool has_path, const char *path, int fd,
749 	char * const *args, char * const *envs,
750 	execve_fetch_element_t fetch_element,
751 	struct execve_data * restrict data)
752 {
753 	struct exec_package	* const epp = &data->ed_pack;
754 	int			error;
755 	struct proc		*p;
756 	char			*dp;
757 	u_int			modgen;
758 
759 	KASSERT(data != NULL);
760 
761 	p = l->l_proc;
762 	modgen = 0;
763 
764 	SDT_PROBE(proc, kernel, , exec, path, 0, 0, 0, 0);
765 
766 	/*
767 	 * Check if we have exceeded our number of processes limit.
768 	 * This is so that we handle the case where a root daemon
769 	 * forked, ran setuid to become the desired user and is trying
770 	 * to exec. The obvious place to do the reference counting check
771 	 * is setuid(), but we don't do the reference counting check there
772 	 * like other OS's do because then all the programs that use setuid()
773 	 * must be modified to check the return code of setuid() and exit().
774 	 * It is dangerous to make setuid() fail, because it fails open and
775 	 * the program will continue to run as root. If we make it succeed
776 	 * and return an error code, again we are not enforcing the limit.
777 	 * The best place to enforce the limit is here, when the process tries
778 	 * to execute a new image, because eventually the process will need
779 	 * to call exec in order to do something useful.
780 	 */
781  retry:
782 	if (p->p_flag & PK_SUGID) {
783 		if (kauth_authorize_process(l->l_cred, KAUTH_PROCESS_RLIMIT,
784 		     p, KAUTH_ARG(KAUTH_REQ_PROCESS_RLIMIT_BYPASS),
785 		     &p->p_rlimit[RLIMIT_NPROC],
786 		     KAUTH_ARG(RLIMIT_NPROC)) != 0 &&
787 		    chgproccnt(kauth_cred_getuid(l->l_cred), 0) >
788 		     p->p_rlimit[RLIMIT_NPROC].rlim_cur)
789 		return EAGAIN;
790 	}
791 
792 	/*
793 	 * Drain existing references and forbid new ones.  The process
794 	 * should be left alone until we're done here.  This is necessary
795 	 * to avoid race conditions - e.g. in ptrace() - that might allow
796 	 * a local user to illicitly obtain elevated privileges.
797 	 */
798 	rw_enter(&p->p_reflock, RW_WRITER);
799 
800 	if (has_path) {
801 		size_t	offs;
802 		/*
803 		 * Init the namei data to point at the user's program name.
804 		 * This is done here rather than in check_exec(), so that it's
805 		 * possible to override these settings if any of the makecmd/probe
806 		 * functions call check_exec() recursively - for example,
807 		 * see exec_script_makecmds().
808 		 */
809 		if ((error = exec_makepathbuf(l, path, UIO_USERSPACE,
810 		    &data->ed_pathbuf, &offs)) != 0)
811 			goto clrflg;
812 		data->ed_pathstring = pathbuf_stringcopy_get(data->ed_pathbuf);
813 		epp->ep_kname = data->ed_pathstring + offs;
814 		data->ed_resolvedname = PNBUF_GET();
815 		epp->ep_resolvedname = data->ed_resolvedname;
816 		epp->ep_xfd = -1;
817 	} else {
818 		data->ed_pathbuf = pathbuf_assimilate(strcpy(PNBUF_GET(), "/"));
819 		data->ed_pathstring = pathbuf_stringcopy_get(data->ed_pathbuf);
820 		epp->ep_kname = "*fexecve*";
821 		data->ed_resolvedname = NULL;
822 		epp->ep_resolvedname = NULL;
823 		epp->ep_xfd = fd;
824 	}
825 
826 
827 	/*
828 	 * initialize the fields of the exec package.
829 	 */
830 	epp->ep_hdr = kmem_alloc(exec_maxhdrsz, KM_SLEEP);
831 	epp->ep_hdrlen = exec_maxhdrsz;
832 	epp->ep_hdrvalid = 0;
833 	epp->ep_emul_arg = NULL;
834 	epp->ep_emul_arg_free = NULL;
835 	memset(&epp->ep_vmcmds, 0, sizeof(epp->ep_vmcmds));
836 	epp->ep_vap = &data->ed_attr;
837 	epp->ep_flags = (p->p_flag & PK_32) ? EXEC_FROM32 : 0;
838 	MD_TOPDOWN_INIT(epp);
839 	epp->ep_emul_root = NULL;
840 	epp->ep_interp = NULL;
841 	epp->ep_esch = NULL;
842 	epp->ep_pax_flags = 0;
843 	memset(epp->ep_machine_arch, 0, sizeof(epp->ep_machine_arch));
844 
845 	rw_enter(&exec_lock, RW_READER);
846 
847 	/* see if we can run it. */
848 	if ((error = check_exec(l, epp, data->ed_pathbuf,
849 	    &data->ed_resolvedname)) != 0) {
850 		if (error != ENOENT && error != EACCES && error != ENOEXEC) {
851 			DPRINTF(("%s: check exec failed for %s, error %d\n",
852 			    __func__, epp->ep_kname, error));
853 		}
854 		goto freehdr;
855 	}
856 
857 	/* allocate an argument buffer */
858 	data->ed_argp = pool_get(&exec_pool, PR_WAITOK);
859 	KASSERT(data->ed_argp != NULL);
860 	dp = data->ed_argp;
861 
862 	if ((error = copyinargs(data, args, envs, fetch_element, &dp)) != 0) {
863 		goto bad;
864 	}
865 
866 	/*
867 	 * Calculate the new stack size.
868 	 */
869 
870 #ifdef __MACHINE_STACK_GROWS_UP
871 /*
872  * copyargs() fills argc/argv/envp from the lower address even on
873  * __MACHINE_STACK_GROWS_UP machines.  Reserve a few words just below the SP
874  * so that _rtld() can use them.
875  */
876 #define	RTLD_GAP	32
877 #else
878 #define	RTLD_GAP	0
879 #endif
880 
881 	const size_t argenvstrlen = (char *)ALIGN(dp) - data->ed_argp;
882 
883 	data->ed_argslen = calcargs(data, argenvstrlen);
884 
885 	const size_t len = calcstack(data, pax_aslr_stack_gap(epp) + RTLD_GAP);
886 
887 	if (len > epp->ep_ssize) {
888 		/* in effect, compare to initial limit */
889 		DPRINTF(("%s: stack limit exceeded %zu\n", __func__, len));
890 		error = ENOMEM;
891 		goto bad;
892 	}
893 	/* adjust "active stack depth" for process VSZ */
894 	epp->ep_ssize = len;
895 
896 	return 0;
897 
898  bad:
899 	/* free the vmspace-creation commands, and release their references */
900 	kill_vmcmds(&epp->ep_vmcmds);
901 	/* kill any opened file descriptor, if necessary */
902 	if (epp->ep_flags & EXEC_HASFD) {
903 		epp->ep_flags &= ~EXEC_HASFD;
904 		fd_close(epp->ep_fd);
905 	}
906 	/* close and put the exec'd file */
907 	vn_lock(epp->ep_vp, LK_EXCLUSIVE | LK_RETRY);
908 	VOP_CLOSE(epp->ep_vp, FREAD, l->l_cred);
909 	vput(epp->ep_vp);
910 	pool_put(&exec_pool, data->ed_argp);
911 
912  freehdr:
913 	kmem_free(epp->ep_hdr, epp->ep_hdrlen);
914 	if (epp->ep_emul_root != NULL)
915 		vrele(epp->ep_emul_root);
916 	if (epp->ep_interp != NULL)
917 		vrele(epp->ep_interp);
918 
919 	rw_exit(&exec_lock);
920 
921 	exec_path_free(data);
922 
923  clrflg:
924 	rw_exit(&p->p_reflock);
925 
926 	if (modgen != module_gen && error == ENOEXEC) {
927 		modgen = module_gen;
928 		exec_autoload();
929 		goto retry;
930 	}
931 
932 	SDT_PROBE(proc, kernel, , exec__failure, error, 0, 0, 0, 0);
933 	return error;
934 }
935 
936 static int
937 execve_dovmcmds(struct lwp *l, struct execve_data * restrict data)
938 {
939 	struct exec_package	* const epp = &data->ed_pack;
940 	struct proc		*p = l->l_proc;
941 	struct exec_vmcmd	*base_vcp;
942 	int			error = 0;
943 	size_t			i;
944 
945 	/* record proc's vnode, for use by procfs and others */
946 	if (p->p_textvp)
947 		vrele(p->p_textvp);
948 	vref(epp->ep_vp);
949 	p->p_textvp = epp->ep_vp;
950 
951 	/* create the new process's VM space by running the vmcmds */
952 	KASSERTMSG(epp->ep_vmcmds.evs_used != 0, "%s: no vmcmds", __func__);
953 
954 #ifdef TRACE_EXEC
955 	DUMPVMCMDS(epp, 0, 0);
956 #endif
957 
958 	base_vcp = NULL;
959 
960 	for (i = 0; i < epp->ep_vmcmds.evs_used && !error; i++) {
961 		struct exec_vmcmd *vcp;
962 
963 		vcp = &epp->ep_vmcmds.evs_cmds[i];
964 		if (vcp->ev_flags & VMCMD_RELATIVE) {
965 			KASSERTMSG(base_vcp != NULL,
966 			    "%s: relative vmcmd with no base", __func__);
967 			KASSERTMSG((vcp->ev_flags & VMCMD_BASE) == 0,
968 			    "%s: illegal base & relative vmcmd", __func__);
969 			vcp->ev_addr += base_vcp->ev_addr;
970 		}
971 		error = (*vcp->ev_proc)(l, vcp);
972 		if (error)
973 			DUMPVMCMDS(epp, i, error);
974 		if (vcp->ev_flags & VMCMD_BASE)
975 			base_vcp = vcp;
976 	}
977 
978 	/* free the vmspace-creation commands, and release their references */
979 	kill_vmcmds(&epp->ep_vmcmds);
980 
981 	vn_lock(epp->ep_vp, LK_EXCLUSIVE | LK_RETRY);
982 	VOP_CLOSE(epp->ep_vp, FREAD, l->l_cred);
983 	vput(epp->ep_vp);
984 
985 	/* if an error happened, deallocate and punt */
986 	if (error != 0) {
987 		DPRINTF(("%s: vmcmd %zu failed: %d\n", __func__, i - 1, error));
988 	}
989 	return error;
990 }
991 
992 static void
993 execve_free_data(struct execve_data *data)
994 {
995 	struct exec_package	* const epp = &data->ed_pack;
996 
997 	/* free the vmspace-creation commands, and release their references */
998 	kill_vmcmds(&epp->ep_vmcmds);
999 	/* kill any opened file descriptor, if necessary */
1000 	if (epp->ep_flags & EXEC_HASFD) {
1001 		epp->ep_flags &= ~EXEC_HASFD;
1002 		fd_close(epp->ep_fd);
1003 	}
1004 
1005 	/* close and put the exec'd file */
1006 	vn_lock(epp->ep_vp, LK_EXCLUSIVE | LK_RETRY);
1007 	VOP_CLOSE(epp->ep_vp, FREAD, curlwp->l_cred);
1008 	vput(epp->ep_vp);
1009 	pool_put(&exec_pool, data->ed_argp);
1010 
1011 	kmem_free(epp->ep_hdr, epp->ep_hdrlen);
1012 	if (epp->ep_emul_root != NULL)
1013 		vrele(epp->ep_emul_root);
1014 	if (epp->ep_interp != NULL)
1015 		vrele(epp->ep_interp);
1016 
1017 	exec_path_free(data);
1018 }
1019 
1020 static void
1021 pathexec(struct proc *p, const char *resolvedname)
1022 {
1023 	/* set command name & other accounting info */
1024 	const char *cmdname;
1025 
1026 	if (resolvedname == NULL) {
1027 		cmdname = "*fexecve*";
1028 		resolvedname = "/";
1029 	} else {
1030 		cmdname = strrchr(resolvedname, '/') + 1;
1031 	}
1032 	KASSERTMSG(resolvedname[0] == '/', "bad resolvedname `%s'",
1033 	    resolvedname);
1034 
1035 	strlcpy(p->p_comm, cmdname, sizeof(p->p_comm));
1036 
1037 	kmem_strfree(p->p_path);
1038 	p->p_path = kmem_strdupsize(resolvedname, NULL, KM_SLEEP);
1039 }
1040 
1041 /* XXX elsewhere */
1042 static int
1043 credexec(struct lwp *l, struct vattr *attr)
1044 {
1045 	struct proc *p = l->l_proc;
1046 	int error;
1047 
1048 	/*
1049 	 * Deal with set[ug]id.  MNT_NOSUID has already been used to disable
1050 	 * s[ug]id.  It's OK to check for PSL_TRACED here as we have blocked
1051 	 * out additional references on the process for the moment.
1052 	 */
1053 	if ((p->p_slflag & PSL_TRACED) == 0 &&
1054 
1055 	    (((attr->va_mode & S_ISUID) != 0 &&
1056 	      kauth_cred_geteuid(l->l_cred) != attr->va_uid) ||
1057 
1058 	     ((attr->va_mode & S_ISGID) != 0 &&
1059 	      kauth_cred_getegid(l->l_cred) != attr->va_gid))) {
1060 		/*
1061 		 * Mark the process as SUGID before we do
1062 		 * anything that might block.
1063 		 */
1064 		proc_crmod_enter();
1065 		proc_crmod_leave(NULL, NULL, true);
1066 
1067 		/* Make sure file descriptors 0..2 are in use. */
1068 		if ((error = fd_checkstd()) != 0) {
1069 			DPRINTF(("%s: fdcheckstd failed %d\n",
1070 			    __func__, error));
1071 			return error;
1072 		}
1073 
1074 		/*
1075 		 * Copy the credential so other references don't see our
1076 		 * changes.
1077 		 */
1078 		l->l_cred = kauth_cred_copy(l->l_cred);
1079 #ifdef KTRACE
1080 		/*
1081 		 * If the persistent trace flag isn't set, turn off tracing.
1082 		 */
1083 		if (p->p_tracep) {
1084 			mutex_enter(&ktrace_lock);
1085 			if (!(p->p_traceflag & KTRFAC_PERSISTENT))
1086 				ktrderef(p);
1087 			mutex_exit(&ktrace_lock);
1088 		}
1089 #endif
1090 		if (attr->va_mode & S_ISUID)
1091 			kauth_cred_seteuid(l->l_cred, attr->va_uid);
1092 		if (attr->va_mode & S_ISGID)
1093 			kauth_cred_setegid(l->l_cred, attr->va_gid);
1094 	} else {
1095 		if (kauth_cred_geteuid(l->l_cred) ==
1096 		    kauth_cred_getuid(l->l_cred) &&
1097 		    kauth_cred_getegid(l->l_cred) ==
1098 		    kauth_cred_getgid(l->l_cred))
1099 			p->p_flag &= ~PK_SUGID;
1100 	}
1101 
1102 	/*
1103 	 * Copy the credential so other references don't see our changes.
1104 	 * Test to see if this is necessary first, since in the common case
1105 	 * we won't need a private reference.
1106 	 */
1107 	if (kauth_cred_geteuid(l->l_cred) != kauth_cred_getsvuid(l->l_cred) ||
1108 	    kauth_cred_getegid(l->l_cred) != kauth_cred_getsvgid(l->l_cred)) {
1109 		l->l_cred = kauth_cred_copy(l->l_cred);
1110 		kauth_cred_setsvuid(l->l_cred, kauth_cred_geteuid(l->l_cred));
1111 		kauth_cred_setsvgid(l->l_cred, kauth_cred_getegid(l->l_cred));
1112 	}
1113 
1114 	/* Update the master credentials. */
1115 	if (l->l_cred != p->p_cred) {
1116 		kauth_cred_t ocred;
1117 
1118 		kauth_cred_hold(l->l_cred);
1119 		mutex_enter(p->p_lock);
1120 		ocred = p->p_cred;
1121 		p->p_cred = l->l_cred;
1122 		mutex_exit(p->p_lock);
1123 		kauth_cred_free(ocred);
1124 	}
1125 
1126 	return 0;
1127 }
1128 
1129 static void
1130 emulexec(struct lwp *l, struct exec_package *epp)
1131 {
1132 	struct proc		*p = l->l_proc;
1133 
1134 	/* The emulation root will usually have been found when we looked
1135 	 * for the ELF interpreter (or similar); if not, look for it now. */
1136 	if (epp->ep_esch->es_emul->e_path != NULL &&
1137 	    epp->ep_emul_root == NULL)
1138 		emul_find_root(l, epp);
1139 
1140 	/* Any old emulation root got removed by fd_closeexec() */
1141 	rw_enter(&p->p_cwdi->cwdi_lock, RW_WRITER);
1142 	p->p_cwdi->cwdi_edir = epp->ep_emul_root;
1143 	rw_exit(&p->p_cwdi->cwdi_lock);
1144 	epp->ep_emul_root = NULL;
1145 	if (epp->ep_interp != NULL)
1146 		vrele(epp->ep_interp);
1147 
1148 	/*
1149 	 * Call emulation specific exec hook. This can setup per-process
1150 	 * p->p_emuldata or do any other per-process stuff an emulation needs.
1151 	 *
1152 	 * If we are executing a process of a different emulation than the
1153 	 * original forked process, call e_proc_exit() of the old emulation
1154 	 * first, then e_proc_exec() of the new emulation.  If the emulation
1155 	 * is the same, the exec hook code should deallocate any old emulation
1156 	 * resources held previously by this process.
1157 	 */
1158 	if (p->p_emul && p->p_emul->e_proc_exit
1159 	    && p->p_emul != epp->ep_esch->es_emul)
1160 		(*p->p_emul->e_proc_exit)(p);
1161 
1162 	/*
1163 	 * Call exec hook. Emulation code may NOT store a reference to anything
1164 	 * from &pack.
1165 	 */
1166 	if (epp->ep_esch->es_emul->e_proc_exec)
1167 		(*epp->ep_esch->es_emul->e_proc_exec)(p, epp);
1168 
1169 	/* update p_emul, the old value is no longer needed */
1170 	p->p_emul = epp->ep_esch->es_emul;
1171 
1172 	/* ...and the same for p_execsw */
1173 	p->p_execsw = epp->ep_esch;
1174 
1175 #ifdef __HAVE_SYSCALL_INTERN
1176 	(*p->p_emul->e_syscall_intern)(p);
1177 #endif
1178 	ktremul();
1179 }
1180 
1181 static int
1182 execve_runproc(struct lwp *l, struct execve_data * restrict data,
1183 	bool no_local_exec_lock, bool is_spawn)
1184 {
1185 	struct exec_package	* const epp = &data->ed_pack;
1186 	int error = 0;
1187 	struct proc		*p;
1188 	struct vmspace		*vm;
1189 
1190 	/*
1191 	 * In case of a posix_spawn operation, the child doing the exec
1192 	 * might not hold the reader lock on exec_lock, but the parent
1193 	 * will do this instead.
1194 	 */
1195 	KASSERT(no_local_exec_lock || rw_lock_held(&exec_lock));
1196 	KASSERT(!no_local_exec_lock || is_spawn);
1197 	KASSERT(data != NULL);
1198 
1199 	p = l->l_proc;
1200 
1201 	/* Get rid of other LWPs. */
1202 	if (p->p_nlwps > 1) {
1203 		mutex_enter(p->p_lock);
1204 		exit_lwps(l);
1205 		mutex_exit(p->p_lock);
1206 	}
1207 	KDASSERT(p->p_nlwps == 1);
1208 
1209 	/*
1210 	 * All of the other LWPs got rid of their robust futexes
1211 	 * when they exited above, but we might still have some
1212 	 * to dispose of.  Do that now.
1213 	 */
1214 	if (__predict_false(l->l_robust_head != 0)) {
1215 		futex_release_all_lwp(l);
1216 		/*
1217 		 * Since this LWP will live on with a different
1218 		 * program image, we need to clear the robust
1219 		 * futex list pointer here.
1220 		 */
1221 		l->l_robust_head = 0;
1222 	}
1223 
1224 	/* Destroy any lwpctl info. */
1225 	if (p->p_lwpctl != NULL)
1226 		lwp_ctl_exit();
1227 
1228 	/* Remove POSIX timers */
1229 	ptimers_free(p, TIMERS_POSIX);
1230 
1231 	/* Set the PaX flags. */
1232 	pax_set_flags(epp, p);
1233 
1234 	/*
1235 	 * Do whatever is necessary to prepare the address space
1236 	 * for remapping.  Note that this might replace the current
1237 	 * vmspace with another!
1238 	 *
1239 	 * vfork(): do not touch any user space data in the new child
1240 	 * until we have awoken the parent below, or it will defeat
1241 	 * lazy pmap switching (on x86).
1242 	 */
1243 	if (is_spawn)
1244 		uvmspace_spawn(l, epp->ep_vm_minaddr,
1245 		    epp->ep_vm_maxaddr,
1246 		    epp->ep_flags & EXEC_TOPDOWN_VM);
1247 	else
1248 		uvmspace_exec(l, epp->ep_vm_minaddr,
1249 		    epp->ep_vm_maxaddr,
1250 		    epp->ep_flags & EXEC_TOPDOWN_VM);
1251 	vm = p->p_vmspace;
1252 
1253 	vm->vm_taddr = (void *)epp->ep_taddr;
1254 	vm->vm_tsize = btoc(epp->ep_tsize);
1255 	vm->vm_daddr = (void*)epp->ep_daddr;
1256 	vm->vm_dsize = btoc(epp->ep_dsize);
1257 	vm->vm_ssize = btoc(epp->ep_ssize);
1258 	vm->vm_issize = 0;
1259 	vm->vm_maxsaddr = (void *)epp->ep_maxsaddr;
1260 	vm->vm_minsaddr = (void *)epp->ep_minsaddr;
1261 
1262 	pax_aslr_init_vm(l, vm, epp);
1263 
1264 	cwdexec(p);
1265 	fd_closeexec();		/* handle close on exec */
1266 
1267 	if (__predict_false(ktrace_on))
1268 		fd_ktrexecfd();
1269 
1270 	execsigs(p);		/* reset caught signals */
1271 
1272 	mutex_enter(p->p_lock);
1273 	l->l_ctxlink = NULL;	/* reset ucontext link */
1274 	p->p_acflag &= ~AFORK;
1275 	p->p_flag |= PK_EXEC;
1276 	mutex_exit(p->p_lock);
1277 
1278 	error = credexec(l, &data->ed_attr);
1279 	if (error)
1280 		goto exec_abort;
1281 
1282 #if defined(__HAVE_RAS)
1283 	/*
1284 	 * Remove all RASs from the address space.
1285 	 */
1286 	ras_purgeall();
1287 #endif
1288 
1289 	/*
1290 	 * Stop profiling.
1291 	 */
1292 	if ((p->p_stflag & PST_PROFIL) != 0) {
1293 		mutex_spin_enter(&p->p_stmutex);
1294 		stopprofclock(p);
1295 		mutex_spin_exit(&p->p_stmutex);
1296 	}
1297 
1298 	/*
1299 	 * It's OK to test PL_PPWAIT unlocked here, as other LWPs have
1300 	 * exited and exec()/exit() are the only places it will be cleared.
1301 	 *
1302 	 * Once the parent has been awoken, curlwp may teleport to a new CPU
1303 	 * in sched_vforkexec(), and it's then OK to start messing with user
1304 	 * data.  See comment above.
1305 	 */
1306 	if ((p->p_lflag & PL_PPWAIT) != 0) {
1307 		bool samecpu;
1308 		lwp_t *lp;
1309 
1310 		mutex_enter(&proc_lock);
1311 		lp = p->p_vforklwp;
1312 		p->p_vforklwp = NULL;
1313 		l->l_lwpctl = NULL; /* was on loan from blocked parent */
1314 		cv_broadcast(&lp->l_waitcv);
1315 
1316 		/* Clear flags after cv_broadcast() (scheduler needs them). */
1317 		p->p_lflag &= ~PL_PPWAIT;
1318 		lp->l_vforkwaiting = false;
1319 
1320 		/* If parent is still on same CPU, teleport curlwp elsewhere. */
1321 		samecpu = (lp->l_cpu == curlwp->l_cpu);
1322 		mutex_exit(&proc_lock);
1323 
1324 		/* Give the parent its CPU back - find a new home. */
1325 		KASSERT(!is_spawn);
1326 		sched_vforkexec(l, samecpu);
1327 	}
1328 
1329 	/* Now map address space. */
1330 	error = execve_dovmcmds(l, data);
1331 	if (error != 0)
1332 		goto exec_abort;
1333 
1334 	pathexec(p, epp->ep_resolvedname);
1335 
1336 	char * const newstack = STACK_GROW(vm->vm_minsaddr, epp->ep_ssize);
1337 
1338 	error = copyoutargs(data, l, newstack);
1339 	if (error != 0)
1340 		goto exec_abort;
1341 
1342 	doexechooks(p);
1343 
1344 	/*
1345 	 * Set initial SP at the top of the stack.
1346 	 *
1347 	 * Note that on machines where the stack grows up (e.g. hppa), SP points to
1348 	 * the end of arg/env strings.  Userland guesses the address of argc
1349 	 * via ps_strings::ps_argvstr.
1350 	 */
1351 
1352 	/* Setup new registers and do misc. setup. */
1353 	(*epp->ep_esch->es_emul->e_setregs)(l, epp, (vaddr_t)newstack);
1354 	if (epp->ep_esch->es_setregs)
1355 		(*epp->ep_esch->es_setregs)(l, epp, (vaddr_t)newstack);
1356 
1357 	/* Provide a consistent LWP private setting */
1358 	(void)lwp_setprivate(l, NULL);
1359 
1360 	/* Discard all PCU state; need to start fresh */
1361 	pcu_discard_all(l);
1362 
1363 	/* map the process's signal trampoline code */
1364 	if ((error = exec_sigcode_map(p, epp->ep_esch->es_emul)) != 0) {
1365 		DPRINTF(("%s: map sigcode failed %d\n", __func__, error));
1366 		goto exec_abort;
1367 	}
1368 
1369 	pool_put(&exec_pool, data->ed_argp);
1370 
1371 	/*
1372 	 * Notify anyone who might care that we've exec'd.
1373 	 *
1374 	 * This is slightly racy; someone could sneak in and
1375 	 * attach a knote after we've decided not to notify,
1376 	 * or vice-versa, but that's not particularly bothersome.
1377 	 * knote_proc_exec() will acquire p->p_lock as needed.
1378 	 */
1379 	if (!SLIST_EMPTY(&p->p_klist)) {
1380 		knote_proc_exec(p);
1381 	}
1382 
1383 	kmem_free(epp->ep_hdr, epp->ep_hdrlen);
1384 
1385 	SDT_PROBE(proc, kernel, , exec__success, epp->ep_kname, 0, 0, 0, 0);
1386 
1387 	emulexec(l, epp);
1388 
1389 	/* Allow new references from the debugger/procfs. */
1390 	rw_exit(&p->p_reflock);
1391 	if (!no_local_exec_lock)
1392 		rw_exit(&exec_lock);
1393 
1394 	mutex_enter(&proc_lock);
1395 
1396 	/* posix_spawn(3) reports a single event with implied exec(3) */
1397 	if ((p->p_slflag & PSL_TRACED) && !is_spawn) {
1398 		mutex_enter(p->p_lock);
1399 		eventswitch(TRAP_EXEC, 0, 0);
1400 		mutex_enter(&proc_lock);
1401 	}
1402 
1403 	if (p->p_sflag & PS_STOPEXEC) {
1404 		ksiginfoq_t kq;
1405 
1406 		KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
1407 		p->p_pptr->p_nstopchild++;
1408 		p->p_waited = 0;
1409 		mutex_enter(p->p_lock);
1410 		ksiginfo_queue_init(&kq);
1411 		sigclearall(p, &contsigmask, &kq);
1412 		lwp_lock(l);
1413 		l->l_stat = LSSTOP;
1414 		p->p_stat = SSTOP;
1415 		p->p_nrlwps--;
1416 		lwp_unlock(l);
1417 		mutex_exit(p->p_lock);
1418 		mutex_exit(&proc_lock);
1419 		lwp_lock(l);
1420 		spc_lock(l->l_cpu);
1421 		mi_switch(l);
1422 		ksiginfo_queue_drain(&kq);
1423 	} else {
1424 		mutex_exit(&proc_lock);
1425 	}
1426 
1427 	exec_path_free(data);
1428 #ifdef TRACE_EXEC
1429 	DPRINTF(("%s finished\n", __func__));
1430 #endif
1431 	return EJUSTRETURN;
1432 
1433  exec_abort:
1434 	SDT_PROBE(proc, kernel, , exec__failure, error, 0, 0, 0, 0);
1435 	rw_exit(&p->p_reflock);
1436 	if (!no_local_exec_lock)
1437 		rw_exit(&exec_lock);
1438 
1439 	exec_path_free(data);
1440 
1441 	/*
1442 	 * The old process doesn't exist anymore.  Exit gracefully:
1443 	 * get rid of the (new) address space we have created, if any; get rid
1444 	 * of our namei data and vnode; and exit, noting failure.
1445 	 */
1446 	if (vm != NULL) {
1447 		uvm_deallocate(&vm->vm_map, VM_MIN_ADDRESS,
1448 			VM_MAXUSER_ADDRESS - VM_MIN_ADDRESS);
1449 	}
1450 
1451 	exec_free_emul_arg(epp);
1452 	pool_put(&exec_pool, data->ed_argp);
1453 	kmem_free(epp->ep_hdr, epp->ep_hdrlen);
1454 	if (epp->ep_emul_root != NULL)
1455 		vrele(epp->ep_emul_root);
1456 	if (epp->ep_interp != NULL)
1457 		vrele(epp->ep_interp);
1458 
1459 	/* Acquire the sched-state mutex (exit1() will release it). */
1460 	if (!is_spawn) {
1461 		mutex_enter(p->p_lock);
1462 		exit1(l, error, SIGABRT);
1463 	}
1464 
1465 	return error;
1466 }
1467 
1468 int
1469 execve1(struct lwp *l, bool has_path, const char *path, int fd,
1470     char * const *args, char * const *envs,
1471     execve_fetch_element_t fetch_element)
1472 {
1473 	struct execve_data data;
1474 	int error;
1475 
1476 	error = execve_loadvm(l, has_path, path, fd, args, envs, fetch_element,
1477 	    &data);
1478 	if (error)
1479 		return error;
1480 	error = execve_runproc(l, &data, false, false);
1481 	return error;
1482 }
1483 
1484 static size_t
1485 fromptrsz(const struct exec_package *epp)
1486 {
1487 	return (epp->ep_flags & EXEC_FROM32) ? sizeof(int) : sizeof(char *);
1488 }
1489 
1490 static size_t
1491 ptrsz(const struct exec_package *epp)
1492 {
1493 	return (epp->ep_flags & EXEC_32) ? sizeof(int) : sizeof(char *);
1494 }
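/*
 * Note the asymmetry: fromptrsz() keys on EXEC_FROM32 (whether the
 * process doing the exec is 32-bit, i.e. how wide the argv/envp
 * pointers we read are), while ptrsz() keys on EXEC_32 (whether the
 * image being exec'd is 32-bit, i.e. how wide the pointers we write
 * to its new stack must be).
 */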
1495 
1496 static size_t
1497 calcargs(struct execve_data * restrict data, const size_t argenvstrlen)
1498 {
1499 	struct exec_package	* const epp = &data->ed_pack;
1500 
1501 	const size_t nargenvptrs =
1502 	    1 +				/* long argc */
1503 	    data->ed_argc +		/* char *argv[] */
1504 	    1 +				/* \0 */
1505 	    data->ed_envc +		/* char *env[] */
1506 	    1;				/* \0 */
1507 
1508 	return (nargenvptrs * ptrsz(epp))	/* pointers */
1509 	    + argenvstrlen			/* strings */
1510 	    + epp->ep_esch->es_arglen;		/* auxinfo */
1511 }
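/*
 * Worked example (illustrative): for a native LP64 exec with
 * ed_argc == 2 and ed_envc == 1, nargenvptrs is 1 + 2 + 1 + 1 + 1 == 6,
 * so the pointer area is 6 * sizeof(char *) == 48 bytes; the string
 * area (argenvstrlen) and es_arglen bytes of auxinfo come on top.
 */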
1512 
1513 static size_t
1514 calcstack(struct execve_data * restrict data, const size_t gaplen)
1515 {
1516 	struct exec_package	* const epp = &data->ed_pack;
1517 
1518 	data->ed_szsigcode = epp->ep_esch->es_emul->e_esigcode -
1519 	    epp->ep_esch->es_emul->e_sigcode;
1520 
1521 	data->ed_ps_strings_sz = (epp->ep_flags & EXEC_32) ?
1522 	    sizeof(struct ps_strings32) : sizeof(struct ps_strings);
1523 
1524 	const size_t sigcode_psstr_sz =
1525 	    data->ed_szsigcode +	/* sigcode */
1526 	    data->ed_ps_strings_sz +	/* ps_strings */
1527 	    STACK_PTHREADSPACE;		/* pthread space */
1528 
1529 	const size_t stacklen =
1530 	    data->ed_argslen +
1531 	    gaplen +
1532 	    sigcode_psstr_sz;
1533 
1534 	/* make the stack "safely" aligned */
1535 	return STACK_LEN_ALIGN(stacklen, STACK_ALIGNBYTES);
1536 }
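/*
 * In other words, the total reservation is
 *
 *	ed_argslen + gaplen + szsigcode + ps_strings_sz + STACK_PTHREADSPACE
 *
 * rounded up to the stack alignment.  gaplen carries the PaX ASLR
 * stack gap (plus RTLD_GAP on stack-grows-up machines), as passed in
 * by execve_loadvm().
 */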
1537 
1538 static int
1539 copyoutargs(struct execve_data * restrict data, struct lwp *l,
1540     char * const newstack)
1541 {
1542 	struct exec_package	* const epp = &data->ed_pack;
1543 	struct proc		*p = l->l_proc;
1544 	int			error;
1545 
1546 	memset(&data->ed_arginfo, 0, sizeof(data->ed_arginfo));
1547 
1548 	/* remember information about the process */
1549 	data->ed_arginfo.ps_nargvstr = data->ed_argc;
1550 	data->ed_arginfo.ps_nenvstr = data->ed_envc;
1551 
1552 	/*
1553 	 * Allocate the stack address passed to the newly execve()'ed process.
1554 	 *
1555 	 * The new stack address will be set to the SP (stack pointer) register
1556 	 * in setregs().
1557 	 */
1558 
1559 	char *newargs = STACK_ALLOC(
1560 	    STACK_SHRINK(newstack, data->ed_argslen), data->ed_argslen);
1561 
1562 	error = (*epp->ep_esch->es_copyargs)(l, epp,
1563 	    &data->ed_arginfo, &newargs, data->ed_argp);
1564 
1565 	if (error) {
1566 		DPRINTF(("%s: copyargs failed %d\n", __func__, error));
1567 		return error;
1568 	}
1569 
1570 	error = copyoutpsstrs(data, p);
1571 	if (error != 0)
1572 		return error;
1573 
1574 	return 0;
1575 }
1576 
1577 static int
1578 copyoutpsstrs(struct execve_data * restrict data, struct proc *p)
1579 {
1580 	struct exec_package	* const epp = &data->ed_pack;
1581 	struct ps_strings32	arginfo32;
1582 	void			*aip;
1583 	int			error;
1584 
1585 	/* fill process ps_strings info */
1586 	p->p_psstrp = (vaddr_t)STACK_ALLOC(STACK_GROW(epp->ep_minsaddr,
1587 	    STACK_PTHREADSPACE), data->ed_ps_strings_sz);
1588 
1589 	if (epp->ep_flags & EXEC_32) {
1590 		aip = &arginfo32;
1591 		arginfo32.ps_argvstr = (vaddr_t)data->ed_arginfo.ps_argvstr;
1592 		arginfo32.ps_nargvstr = data->ed_arginfo.ps_nargvstr;
1593 		arginfo32.ps_envstr = (vaddr_t)data->ed_arginfo.ps_envstr;
1594 		arginfo32.ps_nenvstr = data->ed_arginfo.ps_nenvstr;
1595 	} else
1596 		aip = &data->ed_arginfo;
1597 
1598 	/* copy out the process's ps_strings structure */
1599 	if ((error = copyout(aip, (void *)p->p_psstrp, data->ed_ps_strings_sz))
1600 	    != 0) {
1601 		DPRINTF(("%s: ps_strings copyout %p->%p size %zu failed\n",
1602 		    __func__, aip, (void *)p->p_psstrp, data->ed_ps_strings_sz));
1603 		return error;
1604 	}
1605 
1606 	return 0;
1607 }
1608 
1609 static int
1610 copyinargs(struct execve_data * restrict data, char * const *args,
1611     char * const *envs, execve_fetch_element_t fetch_element, char **dpp)
1612 {
1613 	struct exec_package	* const epp = &data->ed_pack;
1614 	char			*dp;
1615 	size_t			i;
1616 	int			error;
1617 
1618 	dp = *dpp;
1619 
1620 	data->ed_argc = 0;
1621 
1622 	/* copy the fake args list, if there's one, freeing it as we go */
1623 	if (epp->ep_flags & EXEC_HASARGL) {
1624 		struct exec_fakearg	*fa = epp->ep_fa;
1625 
1626 		while (fa->fa_arg != NULL) {
1627 			const size_t maxlen = ARG_MAX - (dp - data->ed_argp);
1628 			size_t len;
1629 
1630 			len = strlcpy(dp, fa->fa_arg, maxlen);
1631 			/* Count NUL into len. */
1632 			if (len < maxlen)
1633 				len++;
1634 			else {
1635 				while (fa->fa_arg != NULL) {
1636 					kmem_free(fa->fa_arg, fa->fa_len);
1637 					fa++;
1638 				}
1639 				kmem_free(epp->ep_fa, epp->ep_fa_len);
1640 				epp->ep_flags &= ~EXEC_HASARGL;
1641 				return E2BIG;
1642 			}
1643 			ktrexecarg(fa->fa_arg, len - 1);
1644 			dp += len;
1645 
1646 			kmem_free(fa->fa_arg, fa->fa_len);
1647 			fa++;
1648 			data->ed_argc++;
1649 		}
1650 		kmem_free(epp->ep_fa, epp->ep_fa_len);
1651 		epp->ep_flags &= ~EXEC_HASARGL;
1652 	}
1653 
1654 	/*
1655 	 * Read and count argument strings from user.
1656 	 */
1657 
1658 	if (args == NULL) {
1659 		DPRINTF(("%s: null args\n", __func__));
1660 		return EINVAL;
1661 	}
1662 	if (epp->ep_flags & EXEC_SKIPARG)
1663 		args = (const void *)((const char *)args + fromptrsz(epp));
1664 	i = 0;
1665 	error = copyinargstrs(data, args, fetch_element, &dp, &i, ktr_execarg);
1666 	if (error != 0) {
1667 		DPRINTF(("%s: copyin arg %d\n", __func__, error));
1668 		return error;
1669 	}
1670 	data->ed_argc += i;
1671 
1672 	/*
1673 	 * Read and count environment strings from user.
1674 	 */
1675 
1676 	data->ed_envc = 0;
1677 	/* environment need not be there */
1678 	if (envs == NULL)
1679 		goto done;
1680 	i = 0;
1681 	error = copyinargstrs(data, envs, fetch_element, &dp, &i, ktr_execenv);
1682 	if (error != 0) {
1683 		DPRINTF(("%s: copyin env %d\n", __func__, error));
1684 		return error;
1685 	}
1686 	data->ed_envc += i;
1687 
1688 done:
1689 	*dpp = dp;
1690 
1691 	return 0;
1692 }
1693 
1694 static int
1695 copyinargstrs(struct execve_data * restrict data, char * const *strs,
1696     execve_fetch_element_t fetch_element, char **dpp, size_t *ip,
1697     void (*ktr)(const void *, size_t))
1698 {
1699 	char			*dp, *sp;
1700 	size_t			i;
1701 	int			error;
1702 
1703 	dp = *dpp;
1704 
1705 	i = 0;
1706 	while (1) {
1707 		const size_t maxlen = ARG_MAX - (dp - data->ed_argp);
1708 		size_t len;
1709 
1710 		if ((error = (*fetch_element)(strs, i, &sp)) != 0) {
1711 			return error;
1712 		}
1713 		if (!sp)
1714 			break;
1715 		if ((error = copyinstr(sp, dp, maxlen, &len)) != 0) {
1716 			if (error == ENAMETOOLONG)
1717 				error = E2BIG;
1718 			return error;
1719 		}
1720 		if (__predict_false(ktrace_on))
1721 			(*ktr)(dp, len - 1);
1722 		dp += len;
1723 		i++;
1724 	}
1725 
1726 	*dpp = dp;
1727 	*ip = i;
1728 
1729 	return 0;
1730 }
1731 
1732 /*
1733  * Copy argv and env strings from kernel buffer (argp) to the new stack.
1734  * Those strings are located just after auxinfo.
1735  */
1736 int
1737 copyargs(struct lwp *l, struct exec_package *pack, struct ps_strings *arginfo,
1738     char **stackp, void *argp)
1739 {
1740 	char	**cpp, *dp, *sp;
1741 	size_t	len;
1742 	void	*nullp;
1743 	long	argc, envc;
1744 	int	error;
1745 
1746 	cpp = (char **)*stackp;
1747 	nullp = NULL;
1748 	argc = arginfo->ps_nargvstr;
1749 	envc = arginfo->ps_nenvstr;
1750 
1751 	/* argc on stack is long */
1752 	CTASSERT(sizeof(*cpp) == sizeof(argc));
1753 
1754 	dp = (char *)(cpp +
1755 	    1 +				/* long argc */
1756 	    argc +			/* char *argv[] */
1757 	    1 +				/* \0 */
1758 	    envc +			/* char *env[] */
1759 	    1) +			/* \0 */
1760 	    pack->ep_esch->es_arglen;	/* auxinfo */
1761 	sp = argp;
1762 
1763 	if ((error = copyout(&argc, cpp++, sizeof(argc))) != 0) {
1764 		COPYPRINTF("", cpp - 1, sizeof(argc));
1765 		return error;
1766 	}
1767 
1768 	/* XXX don't copy them out, remap them! */
1769 	arginfo->ps_argvstr = cpp; /* remember location of argv for later */
1770 
1771 	for (; --argc >= 0; sp += len, dp += len) {
1772 		if ((error = copyout(&dp, cpp++, sizeof(dp))) != 0) {
1773 			COPYPRINTF("", cpp - 1, sizeof(dp));
1774 			return error;
1775 		}
1776 		if ((error = copyoutstr(sp, dp, ARG_MAX, &len)) != 0) {
1777 			COPYPRINTF("str", dp, (size_t)ARG_MAX);
1778 			return error;
1779 		}
1780 	}
1781 
1782 	if ((error = copyout(&nullp, cpp++, sizeof(nullp))) != 0) {
1783 		COPYPRINTF("", cpp - 1, sizeof(nullp));
1784 		return error;
1785 	}
1786 
1787 	arginfo->ps_envstr = cpp; /* remember location of envp for later */
1788 
1789 	for (; --envc >= 0; sp += len, dp += len) {
1790 		if ((error = copyout(&dp, cpp++, sizeof(dp))) != 0) {
1791 			COPYPRINTF("", cpp - 1, sizeof(dp));
1792 			return error;
1793 		}
1794 		if ((error = copyoutstr(sp, dp, ARG_MAX, &len)) != 0) {
1795 			COPYPRINTF("str", dp, (size_t)ARG_MAX);
1796 			return error;
1797 		}
1798 
1799 	}
1800 
1801 	if ((error = copyout(&nullp, cpp++, sizeof(nullp))) != 0) {
1802 		COPYPRINTF("", cpp - 1, sizeof(nullp));
1803 		return error;
1804 	}
1805 
1806 	*stackp = (char *)cpp;
1807 	return 0;
1808 }
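/*
 * Resulting layout, from the incoming *stackp toward higher addresses
 * (an illustrative recap of the dp computation above):
 *
 *	long	argc
 *	char	*argv[argc]
 *	NULL				argv terminator
 *	char	*envp[envc]
 *	NULL				envp terminator
 *	auxinfo				es_arglen bytes, filled in by the
 *					format-specific copyargs wrapper
 *	strings				argv/envp string area that the
 *					argv[] and envp[] slots point into
 */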
1809 
1810 
1811 /*
1812  * Add execsw[] entries.
1813  */
1814 int
1815 exec_add(struct execsw *esp, int count)
1816 {
1817 	struct exec_entry	*it;
1818 	int			i;
1819 
1820 	if (count == 0) {
1821 		return 0;
1822 	}
1823 
1824 	/* Check for duplicates. */
1825 	rw_enter(&exec_lock, RW_WRITER);
1826 	for (i = 0; i < count; i++) {
1827 		LIST_FOREACH(it, &ex_head, ex_list) {
1828 			/* assume unique (makecmds, probe_func, emulation) */
1829 			if (it->ex_sw->es_makecmds == esp[i].es_makecmds &&
1830 			    it->ex_sw->u.elf_probe_func ==
1831 			    esp[i].u.elf_probe_func &&
1832 			    it->ex_sw->es_emul == esp[i].es_emul) {
1833 				rw_exit(&exec_lock);
1834 				return EEXIST;
1835 			}
1836 		}
1837 	}
1838 
1839 	/* Allocate new entries. */
1840 	for (i = 0; i < count; i++) {
1841 		it = kmem_alloc(sizeof(*it), KM_SLEEP);
1842 		it->ex_sw = &esp[i];
1843 		LIST_INSERT_HEAD(&ex_head, it, ex_list);
1844 	}
1845 
1846 	/* update execsw[] */
1847 	exec_init(0);
1848 	rw_exit(&exec_lock);
1849 	return 0;
1850 }
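/*
 * A sketch of how a loadable exec format might drive exec_add() and
 * exec_remove() from its module command hook; the names here are
 * illustrative rather than taken from any particular module:
 */
#if 0
static int
exec_example_modcmd(modcmd_t cmd, void *arg)
{
	switch (cmd) {
	case MODULE_CMD_INIT:
		return exec_add(&example_execsw, 1);
	case MODULE_CMD_FINI:
		return exec_remove(&example_execsw, 1);
	default:
		return ENOTTY;
	}
}
#endif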
1851 
1852 /*
1853  * Remove execsw[] entries.
1854  */
1855 int
1856 exec_remove(struct execsw *esp, int count)
1857 {
1858 	struct exec_entry	*it, *next;
1859 	int			i;
1860 	const struct proclist_desc *pd;
1861 	proc_t			*p;
1862 
1863 	if (count == 0) {
1864 		return 0;
1865 	}
1866 
1867 	/* Abort if any are busy. */
1868 	rw_enter(&exec_lock, RW_WRITER);
1869 	for (i = 0; i < count; i++) {
1870 		mutex_enter(&proc_lock);
1871 		for (pd = proclists; pd->pd_list != NULL; pd++) {
1872 			PROCLIST_FOREACH(p, pd->pd_list) {
1873 				if (p->p_execsw == &esp[i]) {
1874 					mutex_exit(&proc_lock);
1875 					rw_exit(&exec_lock);
1876 					return EBUSY;
1877 				}
1878 			}
1879 		}
1880 		mutex_exit(&proc_lock);
1881 	}
1882 
1883 	/* None are busy, so remove them all. */
1884 	for (i = 0; i < count; i++) {
1885 		for (it = LIST_FIRST(&ex_head); it != NULL; it = next) {
1886 			next = LIST_NEXT(it, ex_list);
1887 			if (it->ex_sw == &esp[i]) {
1888 				LIST_REMOVE(it, ex_list);
1889 				kmem_free(it, sizeof(*it));
1890 				break;
1891 			}
1892 		}
1893 	}
1894 
1895 	/* update execsw[] */
1896 	exec_init(0);
1897 	rw_exit(&exec_lock);
1898 	return 0;
1899 }
1900 
1901 /*
1902  * Initialize exec structures.  If init_boot is true, this also performs
1903  * the necessary one-time initialization (it is called from main() that way).
1904  * Once the system is multiuser, this must be called with exec_lock held,
1905  * i.e. via exec_{add|remove}().
1906  */
1907 int
1908 exec_init(int init_boot)
1909 {
1910 	const struct execsw 	**sw;
1911 	struct exec_entry	*ex;
1912 	SLIST_HEAD(,exec_entry)	first;
1913 	SLIST_HEAD(,exec_entry)	any;
1914 	SLIST_HEAD(,exec_entry)	last;
1915 	int			i, sz;
1916 
1917 	if (init_boot) {
1918 		/* do one-time initializations */
1919 		vaddr_t vmin = 0, vmax;
1920 
1921 		rw_init(&exec_lock);
1922 		mutex_init(&sigobject_lock, MUTEX_DEFAULT, IPL_NONE);
1923 		exec_map = uvm_km_suballoc(kernel_map, &vmin, &vmax,
1924 		    maxexec*NCARGS, VM_MAP_PAGEABLE, false, NULL);
1925 		pool_init(&exec_pool, NCARGS, 0, 0, PR_NOALIGN|PR_NOTOUCH,
1926 		    "execargs", &exec_palloc, IPL_NONE);
1927 		pool_sethardlimit(&exec_pool, maxexec, "should not happen", 0);
1928 	} else {
1929 		KASSERT(rw_write_held(&exec_lock));
1930 	}
1931 
1932 	/* Sort each entry onto the appropriate queue. */
1933 	SLIST_INIT(&first);
1934 	SLIST_INIT(&any);
1935 	SLIST_INIT(&last);
1936 	sz = 0;
1937 	LIST_FOREACH(ex, &ex_head, ex_list) {
1938 		switch (ex->ex_sw->es_prio) {
1939 		case EXECSW_PRIO_FIRST:
1940 			SLIST_INSERT_HEAD(&first, ex, ex_slist);
1941 			break;
1942 		case EXECSW_PRIO_ANY:
1943 			SLIST_INSERT_HEAD(&any, ex, ex_slist);
1944 			break;
1945 		case EXECSW_PRIO_LAST:
1946 			SLIST_INSERT_HEAD(&last, ex, ex_slist);
1947 			break;
1948 		default:
1949 			panic("%s", __func__);
1950 			break;
1951 		}
1952 		sz++;
1953 	}
1954 
1955 	/*
1956 	 * Create new execsw[].  Ensure we do not try a zero-sized
1957 	 * allocation.
1958 	 */
1959 	sw = kmem_alloc(sz * sizeof(struct execsw *) + 1, KM_SLEEP);
1960 	i = 0;
1961 	SLIST_FOREACH(ex, &first, ex_slist) {
1962 		sw[i++] = ex->ex_sw;
1963 	}
1964 	SLIST_FOREACH(ex, &any, ex_slist) {
1965 		sw[i++] = ex->ex_sw;
1966 	}
1967 	SLIST_FOREACH(ex, &last, ex_slist) {
1968 		sw[i++] = ex->ex_sw;
1969 	}
1970 
1971 	/* Replace old execsw[] and free used memory. */
1972 	if (execsw != NULL) {
1973 		kmem_free(__UNCONST(execsw),
1974 		    nexecs * sizeof(struct execsw *) + 1);
1975 	}
1976 	execsw = sw;
1977 	nexecs = sz;
1978 
1979 	/* Figure out the maximum size of an exec header. */
1980 	exec_maxhdrsz = sizeof(int);
1981 	for (i = 0; i < nexecs; i++) {
1982 		if (execsw[i]->es_hdrsz > exec_maxhdrsz)
1983 			exec_maxhdrsz = execsw[i]->es_hdrsz;
1984 	}
1985 
1986 	return 0;
1987 }
1988 
1989 static int
1990 exec_sigcode_map(struct proc *p, const struct emul *e)
1991 {
1992 	vaddr_t va;
1993 	vsize_t sz;
1994 	int error;
1995 	struct uvm_object *uobj;
1996 
1997 	sz = (vaddr_t)e->e_esigcode - (vaddr_t)e->e_sigcode;
1998 
1999 	if (e->e_sigobject == NULL || sz == 0) {
2000 		return 0;
2001 	}
2002 
2003 	/*
2004 	 * If we don't have a sigobject for this emulation, create one.
2005 	 *
2006 	 * sigobject is an anonymous memory object (just like SYSV shared
2007 	 * memory) that we keep a permanent reference to and that we map
2008  * in all processes that need this sigcode.  The creation is simple:
2009 	 * we create an object, add a permanent reference to it, map it in
2010 	 * kernel space, copy out the sigcode to it and unmap it.
2011 	 * We map it with PROT_READ|PROT_EXEC into the process just
2012 	 * the way sys_mmap() would map it.
2013 	 */
2014 
2015 	uobj = *e->e_sigobject;
2016 	if (uobj == NULL) {
2017 		mutex_enter(&sigobject_lock);
2018 		if ((uobj = *e->e_sigobject) == NULL) {
2019 			uobj = uao_create(sz, 0);
2020 			(*uobj->pgops->pgo_reference)(uobj);
2021 			va = vm_map_min(kernel_map);
2022 			if ((error = uvm_map(kernel_map, &va, round_page(sz),
2023 			    uobj, 0, 0,
2024 			    UVM_MAPFLAG(UVM_PROT_RW, UVM_PROT_RW,
2025 			    UVM_INH_SHARE, UVM_ADV_RANDOM, 0)))) {
2026 				printf("kernel mapping failed %d\n", error);
2027 				(*uobj->pgops->pgo_detach)(uobj);
2028 				mutex_exit(&sigobject_lock);
2029 				return error;
2030 			}
2031 			memcpy((void *)va, e->e_sigcode, sz);
2032 #ifdef PMAP_NEED_PROCWR
2033 			pmap_procwr(&proc0, va, sz);
2034 #endif
2035 			uvm_unmap(kernel_map, va, va + round_page(sz));
2036 			*e->e_sigobject = uobj;
2037 		}
2038 		mutex_exit(&sigobject_lock);
2039 	}
2040 
2041 	/* Just a hint to uvm_map where to put it. */
2042 	va = e->e_vm_default_addr(p, (vaddr_t)p->p_vmspace->vm_daddr,
2043 	    round_page(sz), p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN);
2044 
2045 #ifdef __alpha__
2046 	/*
2047 	 * Tru64 puts /sbin/loader at the end of user virtual memory,
2048 	 * which causes the above calculation to put the sigcode at
2049 	 * an invalid address.  Put it just below the text instead.
2050 	 */
2051 	if (va == (vaddr_t)vm_map_max(&p->p_vmspace->vm_map)) {
2052 		va = (vaddr_t)p->p_vmspace->vm_taddr - round_page(sz);
2053 	}
2054 #endif
2055 
2056 	(*uobj->pgops->pgo_reference)(uobj);
2057 	error = uvm_map(&p->p_vmspace->vm_map, &va, round_page(sz),
2058 			uobj, 0, 0,
2059 			UVM_MAPFLAG(UVM_PROT_RX, UVM_PROT_RX, UVM_INH_SHARE,
2060 				    UVM_ADV_RANDOM, 0));
2061 	if (error) {
2062 		DPRINTF(("%s, %d: map %p "
2063 		    "uvm_map %#"PRIxVSIZE"@%#"PRIxVADDR" failed %d\n",
2064 		    __func__, __LINE__, &p->p_vmspace->vm_map, round_page(sz),
2065 		    va, error));
2066 		(*uobj->pgops->pgo_detach)(uobj);
2067 		return error;
2068 	}
2069 	p->p_sigctx.ps_sigcode = (void *)va;
2070 	return 0;
2071 }
2072 
2073 /*
2074  * Release a reference on spawn_exec_data and free its memory if this
2075  * was the last one.
2076  */
2077 static void
2078 spawn_exec_data_release(struct spawn_exec_data *data)
2079 {
2080 	if (atomic_dec_32_nv(&data->sed_refcnt) != 0)
2081 		return;
2082 
2083 	cv_destroy(&data->sed_cv_child_ready);
2084 	mutex_destroy(&data->sed_mtx_child);
2085 
2086 	if (data->sed_actions)
2087 		posix_spawn_fa_free(data->sed_actions,
2088 		    data->sed_actions->len);
2089 	if (data->sed_attrs)
2090 		kmem_free(data->sed_attrs,
2091 		    sizeof(*data->sed_attrs));
2092 	kmem_free(data, sizeof(*data));
2093 }
2094 
2095 static int
2096 handle_posix_spawn_file_actions(struct posix_spawn_file_actions *actions)
2097 {
2098 	struct lwp *l = curlwp;
2099 	register_t retval;
2100 	int error, newfd;
2101 
2102 	if (actions == NULL)
2103 		return 0;
2104 
2105 	for (size_t i = 0; i < actions->len; i++) {
2106 		const struct posix_spawn_file_actions_entry *fae =
2107 		    &actions->fae[i];
2108 		switch (fae->fae_action) {
2109 		case FAE_OPEN:
2110 			if (fd_getfile(fae->fae_fildes) != NULL) {
2111 				error = fd_close(fae->fae_fildes);
2112 				if (error)
2113 					return error;
2114 			}
2115 			error = fd_open(fae->fae_path, fae->fae_oflag,
2116 			    fae->fae_mode, &newfd);
2117 			if (error)
2118 				return error;
2119 			if (newfd != fae->fae_fildes) {
2120 				error = dodup(l, newfd,
2121 				    fae->fae_fildes, 0, &retval);
2122 				if (fd_getfile(newfd) != NULL)
2123 					fd_close(newfd);
2124 			}
2125 			break;
2126 		case FAE_DUP2:
2127 			error = dodup(l, fae->fae_fildes,
2128 			    fae->fae_newfildes, 0, &retval);
2129 			break;
2130 		case FAE_CLOSE:
2131 			if (fd_getfile(fae->fae_fildes) == NULL) {
2132 				return EBADF;
2133 			}
2134 			error = fd_close(fae->fae_fildes);
2135 			break;
2136 		case FAE_CHDIR:
2137 			error = do_sys_chdir(l, fae->fae_chdir_path,
2138 			    UIO_SYSSPACE, &retval);
2139 			break;
2140 		case FAE_FCHDIR:
2141 			error = do_sys_fchdir(l, fae->fae_fildes, &retval);
2142 			break;
2143 		}
2144 		if (error)
2145 			return error;
2146 	}
2147 	return 0;
2148 }
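
/*
 * For illustration (hypothetical userland values): an action recorded with
 *
 *	posix_spawn_file_actions_addopen(&fa, STDOUT_FILENO,
 *	    "/tmp/out", O_WRONLY|O_CREAT, 0644);
 *
 * arrives here as one FAE_OPEN entry: any file already open on fae_fildes
 * is closed, fae_path is opened, and the new descriptor is dup'ed onto
 * fae_fildes if the two differ.
 */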
2149 
2150 static int
2151 handle_posix_spawn_attrs(struct posix_spawnattr *attrs, struct proc *parent)
2152 {
2153 	struct sigaction sigact;
2154 	int error;
2155 	struct proc *p = curproc;
2156 	struct lwp *l = curlwp;
2157 
2158 	if (attrs == NULL)
2159 		return 0;
2160 
2161 	memset(&sigact, 0, sizeof(sigact));
2162 	sigact._sa_u._sa_handler = SIG_DFL;
2163 	sigact.sa_flags = 0;
2164 
2165 	/*
2166 	 * Set state to SSTOP so that this proc can be found by pid;
2167 	 * see proc_enterpgrp and do_sched_setparam below.
2168 	 */
2169 	mutex_enter(&proc_lock);
2170 	/*
2171 	 * p_stat should be SACTIVE, so we need to adjust the
2172 	 * parent's p_nstopchild here.  For safety, just make
2173 	 * we're on the good side of SDEAD before we adjust.
2174 	 */
2175 	int ostat = p->p_stat;
2176 	KASSERT(ostat < SSTOP);
2177 	p->p_stat = SSTOP;
2178 	p->p_waited = 0;
2179 	p->p_pptr->p_nstopchild++;
2180 	mutex_exit(&proc_lock);
2181 
2182 	/* Set process group */
2183 	if (attrs->sa_flags & POSIX_SPAWN_SETPGROUP) {
2184 		pid_t mypid = p->p_pid;
2185 		pid_t pgrp = attrs->sa_pgroup;
2186 
2187 		if (pgrp == 0)
2188 			pgrp = mypid;
2189 
2190 		error = proc_enterpgrp(parent, mypid, pgrp, false);
2191 		if (error)
2192 			goto out;
2193 	}
2194 
2195 	/* Set scheduler policy */
2196 	if (attrs->sa_flags & POSIX_SPAWN_SETSCHEDULER)
2197 		error = do_sched_setparam(p->p_pid, 0, attrs->sa_schedpolicy,
2198 		    &attrs->sa_schedparam);
2199 	else if (attrs->sa_flags & POSIX_SPAWN_SETSCHEDPARAM) {
2200 		error = do_sched_setparam(parent->p_pid, 0,
2201 		    SCHED_NONE, &attrs->sa_schedparam);
2202 	}
2203 	if (error)
2204 		goto out;
2205 
2206 	/* Reset user ID's */
2207 	if (attrs->sa_flags & POSIX_SPAWN_RESETIDS) {
2208 		error = do_setresgid(l, -1, kauth_cred_getgid(l->l_cred), -1,
2209 		     ID_E_EQ_R | ID_E_EQ_S);
2210 		if (error)
2211 			return error;
2212 		error = do_setresuid(l, -1, kauth_cred_getuid(l->l_cred), -1,
2213 		    ID_E_EQ_R | ID_E_EQ_S);
2214 		if (error)
2215 			goto out;
2216 	}
2217 
2218 	/* Set signal masks/defaults */
2219 	if (attrs->sa_flags & POSIX_SPAWN_SETSIGMASK) {
2220 		mutex_enter(p->p_lock);
2221 		error = sigprocmask1(l, SIG_SETMASK, &attrs->sa_sigmask, NULL);
2222 		mutex_exit(p->p_lock);
2223 		if (error)
2224 			goto out;
2225 	}
2226 
2227 	if (attrs->sa_flags & POSIX_SPAWN_SETSIGDEF) {
2228 		/*
2229 		 * The following sigaction call uses a sigaction
2230 		 * version 0 trampoline, which exists only in the
2231 		 * compatibility code. This is not a problem because for
2232 		 * SIG_DFL and SIG_IGN the trampolines are now ignored.
2233 		 * If they were not, it would be a problem: we are
2234 		 * holding the exec_lock, and the compat code needs
2235 		 * to take the same lock in order to replace the
2236 		 * trampoline code of the process.
2237 		 */
2238 		for (int i = 1; i <= NSIG; i++) {
2239 			if (sigismember(&attrs->sa_sigdefault, i))
2240 				sigaction1(l, i, &sigact, NULL, NULL, 0);
2241 		}
2242 	}
2243 	error = 0;
2244 out:
2245 	mutex_enter(&proc_lock);
2246 	p->p_stat = ostat;
2247 	p->p_pptr->p_nstopchild--;
2248 	mutex_exit(&proc_lock);
2249 	return error;
2250 }
2251 
2252 /*
2253  * A child lwp of a posix_spawn operation starts here and ends up in
2254  * cpu_spawn_return, dealing with all file descriptor and scheduler
2255  * manipulations in between.
2256  * The parent waits for the child, as it is not clear whether the child
2257  * will be able to acquire its own exec_lock. If it can, the parent can
2258  * be released early and continue running in parallel. If not (or if the
2259  * magic debug flag is passed in the scheduler attribute struct), the
2260  * child rides on the parent's exec lock until it is ready to return to
2261  * to userland - and only then releases the parent. This method loses
2262  * concurrency, but improves error reporting.
2263  */
2264 static void
2265 spawn_return(void *arg)
2266 {
2267 	struct spawn_exec_data *spawn_data = arg;
2268 	struct lwp *l = curlwp;
2269 	struct proc *p = l->l_proc;
2270 	int error;
2271 	bool have_reflock;
2272 	bool parent_is_waiting = true;
2273 
2274 	/*
2275 	 * Check if we can release the parent early.
2276 	 * We either need to have no sed_attrs, or sed_attrs must not
2277 	 * have POSIX_SPAWN_RETURNERROR or any of the flags that require
2278 	 * safe access to the parent proc (passed in sed_parent).
2279 	 * We then try to get the exec_lock; only if that succeeds can we
2280 	 * release the parent here.
2281 	 */
2282 	struct posix_spawnattr *attrs = spawn_data->sed_attrs;
2283 	if ((!attrs || (attrs->sa_flags
2284 		& (POSIX_SPAWN_RETURNERROR|POSIX_SPAWN_SETPGROUP)) == 0)
2285 	    && rw_tryenter(&exec_lock, RW_READER)) {
2286 		parent_is_waiting = false;
2287 		mutex_enter(&spawn_data->sed_mtx_child);
2288 		cv_signal(&spawn_data->sed_cv_child_ready);
2289 		mutex_exit(&spawn_data->sed_mtx_child);
2290 	}
2291 
2292 	/* don't allow debugger access yet */
2293 	rw_enter(&p->p_reflock, RW_WRITER);
2294 	have_reflock = true;
2295 
2296 	/* handle posix_spawnattr */
2297 	error = handle_posix_spawn_attrs(attrs, spawn_data->sed_parent);
2298 	if (error)
2299 		goto report_error;
2300 
2301 	/* handle posix_spawn_file_actions */
2302 	error = handle_posix_spawn_file_actions(spawn_data->sed_actions);
2303 	if (error)
2304 		goto report_error;
2305 
2306 	/* now do the real exec */
2307 	error = execve_runproc(l, &spawn_data->sed_exec, parent_is_waiting,
2308 	    true);
2309 	have_reflock = false;
2310 	if (error == EJUSTRETURN)
2311 		error = 0;
2312 	else if (error)
2313 		goto report_error;
2314 
2315 	if (parent_is_waiting) {
2316 		mutex_enter(&spawn_data->sed_mtx_child);
2317 		cv_signal(&spawn_data->sed_cv_child_ready);
2318 		mutex_exit(&spawn_data->sed_mtx_child);
2319 	}
2320 
2321 	/* release our refcount on the data */
2322 	spawn_exec_data_release(spawn_data);
2323 
2324 	if ((p->p_slflag & (PSL_TRACED|PSL_TRACEDCHILD)) ==
2325 	    (PSL_TRACED|PSL_TRACEDCHILD)) {
2326 		eventswitchchild(p, TRAP_CHLD, PTRACE_POSIX_SPAWN);
2327 	}
2328 
2329 	/* and finally: leave to userland for the first time */
2330 	cpu_spawn_return(l);
2331 
2332 	/* NOTREACHED */
2333 	return;
2334 
2335  report_error:
2336 	if (have_reflock) {
2337 		/*
2338 		 * We have not passed through execve_runproc(),
2339 		 * which would have released the p_reflock and also
2340 		 * taken ownership of the sed_exec part of spawn_data,
2341 		 * so release/free both here.
2342 		 */
2343 		rw_exit(&p->p_reflock);
2344 		execve_free_data(&spawn_data->sed_exec);
2345 	}
2346 
2347 	if (parent_is_waiting) {
2348 		/* pass error to parent */
2349 		mutex_enter(&spawn_data->sed_mtx_child);
2350 		spawn_data->sed_error = error;
2351 		cv_signal(&spawn_data->sed_cv_child_ready);
2352 		mutex_exit(&spawn_data->sed_mtx_child);
2353 	} else {
2354 		rw_exit(&exec_lock);
2355 	}
2356 
2357 	/* release our refcount on the data */
2358 	spawn_exec_data_release(spawn_data);
2359 
2360 	/* done, exit */
2361 	mutex_enter(p->p_lock);
2362 	/*
2363 	 * POSIX explicitly asks for an exit code of 127 if we report
2364 	 * errors from the child process - so, unfortunately, there
2365 	 * is no way to report a more exact error code.
2366 	 * A NetBSD-specific workaround is POSIX_SPAWN_RETURNERROR as a
2367 	 * flag bit in the attrp argument to posix_spawn(2), see above.
2368 	 */
2369 	exit1(l, 127, 0);
2370 }
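
/*
 * Sketch of the NetBSD-specific POSIX_SPAWN_RETURNERROR extension
 * mentioned above, as a userland caller might use it (illustrative
 * only, error handling omitted):
 *
 *	posix_spawnattr_t attr;
 *	posix_spawnattr_init(&attr);
 *	posix_spawnattr_setflags(&attr, POSIX_SPAWN_RETURNERROR);
 *	error = posix_spawn(&pid, path, NULL, &attr, argv, envp);
 *
 * With the flag set, an exec failure in the child is reported back in
 * 'error' rather than through a child exit status of 127.
 */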
2371 
2372 static __inline char **
2373 posix_spawn_fae_path(struct posix_spawn_file_actions_entry *fae)
2374 {
2375 	switch (fae->fae_action) {
2376 	case FAE_OPEN:
2377 		return &fae->fae_path;
2378 	case FAE_CHDIR:
2379 		return &fae->fae_chdir_path;
2380 	default:
2381 		return NULL;
2382 	}
2383 }
2384 
2385 void
2386 posix_spawn_fa_free(struct posix_spawn_file_actions *fa, size_t len)
2387 {
2388 
2389 	for (size_t i = 0; i < len; i++) {
2390 		char **pathp = posix_spawn_fae_path(&fa->fae[i]);
2391 		if (pathp)
2392 			kmem_strfree(*pathp);
2393 	}
2394 	if (fa->len > 0)
2395 		kmem_free(fa->fae, sizeof(*fa->fae) * fa->len);
2396 	kmem_free(fa, sizeof(*fa));
2397 }
2398 
2399 static int
2400 posix_spawn_fa_alloc(struct posix_spawn_file_actions **fap,
2401     const struct posix_spawn_file_actions *ufa, rlim_t lim)
2402 {
2403 	struct posix_spawn_file_actions *fa;
2404 	struct posix_spawn_file_actions_entry *fae;
2405 	char *pbuf = NULL;
2406 	int error;
2407 	size_t i = 0;
2408 
2409 	fa = kmem_alloc(sizeof(*fa), KM_SLEEP);
2410 	error = copyin(ufa, fa, sizeof(*fa));
2411 	if (error || fa->len == 0) {
2412 		kmem_free(fa, sizeof(*fa));
2413 		return error;	/* 0 if not an error, and len == 0 */
2414 	}
2415 
2416 	if (fa->len > lim) {
2417 		kmem_free(fa, sizeof(*fa));
2418 		return EINVAL;
2419 	}
2420 
2421 	fa->size = fa->len;
2422 	size_t fal = fa->len * sizeof(*fae);
2423 	fae = fa->fae;
2424 	fa->fae = kmem_alloc(fal, KM_SLEEP);
2425 	error = copyin(fae, fa->fae, fal);
2426 	if (error)
2427 		goto out;
2428 
2429 	pbuf = PNBUF_GET();
2430 	for (; i < fa->len; i++) {
2431 		char **pathp = posix_spawn_fae_path(&fa->fae[i]);
2432 		if (pathp == NULL)
2433 			continue;
2434 		error = copyinstr(*pathp, pbuf, MAXPATHLEN, &fal);
2435 		if (error)
2436 			goto out;
2437 		*pathp = kmem_alloc(fal, KM_SLEEP);
2438 		memcpy(*pathp, pbuf, fal);
2439 	}
2440 	PNBUF_PUT(pbuf);
2441 
2442 	*fap = fa;
2443 	return 0;
2444 out:
2445 	if (pbuf)
2446 		PNBUF_PUT(pbuf);
2447 	posix_spawn_fa_free(fa, i);
2448 	return error;
2449 }
2450 
2451 /*
2452  * N.B. increments nprocs upon success.  Callers must decrement nprocs
2453  * if they subsequently fail for some other reason.
2454  */
2455 int
2456 check_posix_spawn(struct lwp *l1)
2457 {
2458 	int error, tnprocs, count;
2459 	uid_t uid;
2460 	struct proc *p1;
2461 
2462 	p1 = l1->l_proc;
2463 	uid = kauth_cred_getuid(l1->l_cred);
2464 	tnprocs = atomic_inc_uint_nv(&nprocs);
2465 
2466 	/*
2467 	 * Although process entries are dynamically created, we still keep
2468 	 * a global limit on the maximum number we will create.
2469 	 */
2470 	if (__predict_false(tnprocs >= maxproc))
2471 		error = -1;
2472 	else
2473 		error = kauth_authorize_process(l1->l_cred,
2474 		    KAUTH_PROCESS_FORK, p1, KAUTH_ARG(tnprocs), NULL, NULL);
2475 
2476 	if (error) {
2477 		atomic_dec_uint(&nprocs);
2478 		return EAGAIN;
2479 	}
2480 
2481 	/*
2482 	 * Enforce limits.
2483 	 */
2484 	count = chgproccnt(uid, 1);
2485 	if (kauth_authorize_process(l1->l_cred, KAUTH_PROCESS_RLIMIT,
2486 	     p1, KAUTH_ARG(KAUTH_REQ_PROCESS_RLIMIT_BYPASS),
2487 	     &p1->p_rlimit[RLIMIT_NPROC], KAUTH_ARG(RLIMIT_NPROC)) != 0 &&
2488 	    __predict_false(count > p1->p_rlimit[RLIMIT_NPROC].rlim_cur)) {
2489 		(void)chgproccnt(uid, -1);
2490 		atomic_dec_uint(&nprocs);
2491 		return EAGAIN;
2492 	}
2493 
2494 	return 0;
2495 }
2496 
2497 int
2498 do_posix_spawn(struct lwp *l1, pid_t *pid_res, bool *child_ok, const char *path,
2499 	struct posix_spawn_file_actions *fa,
2500 	struct posix_spawnattr *sa,
2501 	char *const *argv, char *const *envp,
2502 	execve_fetch_element_t fetch)
2503 {
2504 
2505 	struct proc *p1, *p2;
2506 	struct lwp *l2;
2507 	int error;
2508 	struct spawn_exec_data *spawn_data;
2509 	vaddr_t uaddr;
2510 	pid_t pid;
2511 	bool have_exec_lock = false;
2512 
2513 	p1 = l1->l_proc;
2514 
2515 	/* Allocate and init spawn_data */
2516 	spawn_data = kmem_zalloc(sizeof(*spawn_data), KM_SLEEP);
2517 	spawn_data->sed_refcnt = 1; /* only parent so far */
2518 	cv_init(&spawn_data->sed_cv_child_ready, "pspawn");
2519 	mutex_init(&spawn_data->sed_mtx_child, MUTEX_DEFAULT, IPL_NONE);
2520 	mutex_enter(&spawn_data->sed_mtx_child);
2521 
2522 	/*
2523 	 * Do the first part of the exec now and collect its
2524 	 * state in spawn_data.
2525 	 */
2526 	error = execve_loadvm(l1, true, path, -1, argv,
2527 	    envp, fetch, &spawn_data->sed_exec);
2528 	if (error == EJUSTRETURN)
2529 		error = 0;
2530 	else if (error)
2531 		goto error_exit;
2532 
2533 	have_exec_lock = true;
2534 
2535 	/*
2536 	 * Allocate virtual address space for the U-area now, while it
2537 	 * is still easy to abort the fork operation if we're out of
2538 	 * kernel virtual address space.
2539 	 */
2540 	uaddr = uvm_uarea_alloc();
2541 	if (__predict_false(uaddr == 0)) {
2542 		error = ENOMEM;
2543 		goto error_exit;
2544 	}
2545 
2546 	/*
2547 	 * Allocate a new proc.  Borrow proc0's vmspace for it; we will
2548 	 * replace it with the child's own vmspace before returning to
2549 	 * userland in the child.
2550 	 */
2551 	p2 = proc_alloc();
2552 	if (p2 == NULL) {
2553 		/* We were unable to allocate a process ID. */
2554 		error = EAGAIN;
2555 		goto error_exit;
2556 	}
2557 
2558 	/*
2559 	 * This is a point of no return; past this point we will have to
2560 	 * go through the child proc to clean it up properly.
2561 	 */
2562 	pid = p2->p_pid;
2563 
2564 	/*
2565 	 * Make a proc table entry for the new process.
2566 	 * Start by zeroing the section of proc that is zero-initialized,
2567 	 * then copy the section that is copied directly from the parent.
2568 	 */
2569 	memset(&p2->p_startzero, 0,
2570 	    (unsigned) ((char *)&p2->p_endzero - (char *)&p2->p_startzero));
2571 	memcpy(&p2->p_startcopy, &p1->p_startcopy,
2572 	    (unsigned) ((char *)&p2->p_endcopy - (char *)&p2->p_startcopy));
2573 	p2->p_vmspace = proc0.p_vmspace;
2574 
2575 	TAILQ_INIT(&p2->p_sigpend.sp_info);
2576 
2577 	LIST_INIT(&p2->p_lwps);
2578 	LIST_INIT(&p2->p_sigwaiters);
2579 
2580 	/*
2581 	 * Duplicate sub-structures as needed.
2582 	 * Increase reference counts on shared objects.
2583 	 * Inherit flags we want to keep.  The flags related to SIGCHLD
2584 	 * handling are important in order to keep a consistent behaviour
2585 	 * for the child after the fork.  If we are a 32-bit process, the
2586 	 * child will be too.
2587 	 */
2588 	p2->p_flag =
2589 	    p1->p_flag & (PK_SUGID | PK_NOCLDWAIT | PK_CLDSIGIGN | PK_32);
2590 	p2->p_emul = p1->p_emul;
2591 	p2->p_execsw = p1->p_execsw;
2592 
2593 	mutex_init(&p2->p_stmutex, MUTEX_DEFAULT, IPL_HIGH);
2594 	mutex_init(&p2->p_auxlock, MUTEX_DEFAULT, IPL_NONE);
2595 	rw_init(&p2->p_reflock);
2596 	cv_init(&p2->p_waitcv, "wait");
2597 	cv_init(&p2->p_lwpcv, "lwpwait");
2598 
2599 	p2->p_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
2600 
2601 	kauth_proc_fork(p1, p2);
2602 
2603 	p2->p_raslist = NULL;
2604 	p2->p_fd = fd_copy();
2605 
2606 	/* XXX racy */
2607 	p2->p_mqueue_cnt = p1->p_mqueue_cnt;
2608 
2609 	p2->p_cwdi = cwdinit();
2610 
2611 	/*
2612 	 * Note: p_limit (rlimit stuff) is copy-on-write, so normally
2613 	 * we just need to increase pl_refcnt.
2614 	 */
2615 	if (!p1->p_limit->pl_writeable) {
2616 		lim_addref(p1->p_limit);
2617 		p2->p_limit = p1->p_limit;
2618 	} else {
2619 		p2->p_limit = lim_copy(p1->p_limit);
2620 	}
2621 
2622 	p2->p_lflag = 0;
2623 	l1->l_vforkwaiting = false;
2624 	p2->p_sflag = 0;
2625 	p2->p_slflag = 0;
2626 	p2->p_pptr = p1;
2627 	p2->p_ppid = p1->p_pid;
2628 	LIST_INIT(&p2->p_children);
2629 
2630 	p2->p_aio = NULL;
2631 
2632 #ifdef KTRACE
2633 	/*
2634 	 * Copy traceflag and tracefile if enabled.
2635 	 * If not inherited, these were zeroed above.
2636 	 */
2637 	if (p1->p_traceflag & KTRFAC_INHERIT) {
2638 		mutex_enter(&ktrace_lock);
2639 		p2->p_traceflag = p1->p_traceflag;
2640 		if ((p2->p_tracep = p1->p_tracep) != NULL)
2641 			ktradref(p2);
2642 		mutex_exit(&ktrace_lock);
2643 	}
2644 #endif
2645 
2646 	/*
2647 	 * Create signal actions for the child process.
2648 	 */
2649 	p2->p_sigacts = sigactsinit(p1, 0);
2650 	mutex_enter(p1->p_lock);
2651 	p2->p_sflag |=
2652 	    (p1->p_sflag & (PS_STOPFORK | PS_STOPEXEC | PS_NOCLDSTOP));
2653 	sched_proc_fork(p1, p2);
2654 	mutex_exit(p1->p_lock);
2655 
2656 	p2->p_stflag = p1->p_stflag;
2657 
2658 	/*
2659 	 * p_stats.
2660 	 * Copy parts of p_stats, and zero out the rest.
2661 	 */
2662 	p2->p_stats = pstatscopy(p1->p_stats);
2663 
2664 	/* copy over machdep flags to the new proc */
2665 	cpu_proc_fork(p1, p2);
2666 
2667 	/*
2668 	 * Prepare remaining parts of spawn data
2669 	 */
2670 	spawn_data->sed_actions = fa;
2671 	spawn_data->sed_attrs = sa;
2672 
2673 	spawn_data->sed_parent = p1;
2674 
2675 	/* create LWP */
2676 	lwp_create(l1, p2, uaddr, 0, NULL, 0, spawn_return, spawn_data,
2677 	    &l2, l1->l_class, &l1->l_sigmask, &l1->l_sigstk);
2678 	l2->l_ctxlink = NULL;	/* reset ucontext link */
2679 
2680 	/*
2681 	 * Copy the credential so other references don't see our changes.
2682 	 * Test to see if this is necessary first, since in the common case
2683 	 * we won't need a private reference.
2684 	 */
2685 	if (kauth_cred_geteuid(l2->l_cred) != kauth_cred_getsvuid(l2->l_cred) ||
2686 	    kauth_cred_getegid(l2->l_cred) != kauth_cred_getsvgid(l2->l_cred)) {
2687 		l2->l_cred = kauth_cred_copy(l2->l_cred);
2688 		kauth_cred_setsvuid(l2->l_cred, kauth_cred_geteuid(l2->l_cred));
2689 		kauth_cred_setsvgid(l2->l_cred, kauth_cred_getegid(l2->l_cred));
2690 	}
2691 
2692 	/* Update the master credentials. */
2693 	if (l2->l_cred != p2->p_cred) {
2694 		kauth_cred_t ocred;
2695 
2696 		kauth_cred_hold(l2->l_cred);
2697 		mutex_enter(p2->p_lock);
2698 		ocred = p2->p_cred;
2699 		p2->p_cred = l2->l_cred;
2700 		mutex_exit(p2->p_lock);
2701 		kauth_cred_free(ocred);
2702 	}
2703 
2704 	*child_ok = true;
2705 	spawn_data->sed_refcnt = 2;	/* child gets it as well */
2706 #if 0
2707 	l2->l_nopreempt = 1; /* start it non-preemptable */
2708 #endif
2709 
2710 	/*
2711 	 * It's now safe for the scheduler and other processes to see the
2712 	 * child process.
2713 	 */
2714 	mutex_enter(&proc_lock);
2715 
2716 	if (p1->p_session->s_ttyvp != NULL && p1->p_lflag & PL_CONTROLT)
2717 		p2->p_lflag |= PL_CONTROLT;
2718 
2719 	LIST_INSERT_HEAD(&p1->p_children, p2, p_sibling);
2720 	p2->p_exitsig = SIGCHLD;	/* signal for parent on exit */
2721 
2722 	if ((p1->p_slflag & (PSL_TRACEPOSIX_SPAWN|PSL_TRACED)) ==
2723 	    (PSL_TRACEPOSIX_SPAWN|PSL_TRACED)) {
2724 		proc_changeparent(p2, p1->p_pptr);
2725 		SET(p2->p_slflag, PSL_TRACEDCHILD);
2726 	}
2727 
2728 	p2->p_oppid = p1->p_pid;  /* Remember the original parent id. */
2729 
2730 	LIST_INSERT_AFTER(p1, p2, p_pglist);
2731 	LIST_INSERT_HEAD(&allproc, p2, p_list);
2732 
2733 	p2->p_trace_enabled = trace_is_enabled(p2);
2734 #ifdef __HAVE_SYSCALL_INTERN
2735 	(*p2->p_emul->e_syscall_intern)(p2);
2736 #endif
2737 
2738 	/*
2739 	 * Make the child runnable, set its start time, and add it to the run
2740 	 * queue, unless the parent requested the child to start in SSTOP state.
2741 	 */
2742 	mutex_enter(p2->p_lock);
2743 
2744 	getmicrotime(&p2->p_stats->p_start);
2745 
2746 	lwp_lock(l2);
2747 	KASSERT(p2->p_nrlwps == 1);
2748 	KASSERT(l2->l_stat == LSIDL);
2749 	p2->p_nrlwps = 1;
2750 	p2->p_stat = SACTIVE;
2751 	setrunnable(l2);
2752 	/* LWP now unlocked */
2753 
2754 	mutex_exit(p2->p_lock);
2755 	mutex_exit(&proc_lock);
2756 
2757 	cv_wait(&spawn_data->sed_cv_child_ready, &spawn_data->sed_mtx_child);
2758 	error = spawn_data->sed_error;
2759 	mutex_exit(&spawn_data->sed_mtx_child);
2760 	spawn_exec_data_release(spawn_data);
2761 
2762 	rw_exit(&p1->p_reflock);
2763 	rw_exit(&exec_lock);
2764 	have_exec_lock = false;
2765 
2766 	*pid_res = pid;
2767 
2768 	if (error)
2769 		return error;
2770 
2771 	if (p1->p_slflag & PSL_TRACED) {
2772 		/* Paranoid check */
2773 		mutex_enter(&proc_lock);
2774 		if ((p1->p_slflag & (PSL_TRACEPOSIX_SPAWN|PSL_TRACED)) !=
2775 		    (PSL_TRACEPOSIX_SPAWN|PSL_TRACED)) {
2776 			mutex_exit(&proc_lock);
2777 			return 0;
2778 		}
2779 
2780 		mutex_enter(p1->p_lock);
2781 		eventswitch(TRAP_CHLD, PTRACE_POSIX_SPAWN, pid);
2782 	}
2783 	return 0;
2784 
2785  error_exit:
2786 	if (have_exec_lock) {
2787 		execve_free_data(&spawn_data->sed_exec);
2788 		rw_exit(&p1->p_reflock);
2789 		rw_exit(&exec_lock);
2790 	}
2791 	mutex_exit(&spawn_data->sed_mtx_child);
2792 	spawn_exec_data_release(spawn_data);
2793 
2794 	return error;
2795 }
2796 
2797 int
2798 sys_posix_spawn(struct lwp *l1, const struct sys_posix_spawn_args *uap,
2799     register_t *retval)
2800 {
2801 	/* {
2802 		syscallarg(pid_t *) pid;
2803 		syscallarg(const char *) path;
2804 		syscallarg(const struct posix_spawn_file_actions *) file_actions;
2805 		syscallarg(const struct posix_spawnattr *) attrp;
2806 		syscallarg(char *const *) argv;
2807 		syscallarg(char *const *) envp;
2808 	} */
2809 
2810 	int error;
2811 	struct posix_spawn_file_actions *fa = NULL;
2812 	struct posix_spawnattr *sa = NULL;
2813 	pid_t pid;
2814 	bool child_ok = false;
2815 	rlim_t max_fileactions;
2816 	proc_t *p = l1->l_proc;
2817 
2818 	/* check_posix_spawn() increments nprocs for us. */
2819 	error = check_posix_spawn(l1);
2820 	if (error) {
2821 		*retval = error;
2822 		return 0;
2823 	}
2824 
2825 	/* copy in file_actions struct */
2826 	if (SCARG(uap, file_actions) != NULL) {
2827 		max_fileactions = 2 * uimin(p->p_rlimit[RLIMIT_NOFILE].rlim_cur,
2828 		    maxfiles);
2829 		error = posix_spawn_fa_alloc(&fa, SCARG(uap, file_actions),
2830 		    max_fileactions);
2831 		if (error)
2832 			goto error_exit;
2833 	}
2834 
2835 	/* copyin posix_spawnattr struct */
2836 	if (SCARG(uap, attrp) != NULL) {
2837 		sa = kmem_alloc(sizeof(*sa), KM_SLEEP);
2838 		error = copyin(SCARG(uap, attrp), sa, sizeof(*sa));
2839 		if (error)
2840 			goto error_exit;
2841 	}
2842 
2843 	/*
2844 	 * Do the spawn
2845 	 */
2846 	error = do_posix_spawn(l1, &pid, &child_ok, SCARG(uap, path), fa, sa,
2847 	    SCARG(uap, argv), SCARG(uap, envp), execve_fetch_element);
2848 	if (error)
2849 		goto error_exit;
2850 
2851 	if (error == 0 && SCARG(uap, pid) != NULL)
2852 		error = copyout(&pid, SCARG(uap, pid), sizeof(pid));
2853 
2854 	*retval = error;
2855 	return 0;
2856 
2857  error_exit:
2858 	if (!child_ok) {
2859 		(void)chgproccnt(kauth_cred_getuid(l1->l_cred), -1);
2860 		atomic_dec_uint(&nprocs);
2861 
2862 		if (sa)
2863 			kmem_free(sa, sizeof(*sa));
2864 		if (fa)
2865 			posix_spawn_fa_free(fa, fa->len);
2866 	}
2867 
2868 	*retval = error;
2869 	return 0;
2870 }
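
/*
 * For reference, a minimal userland invocation of posix_spawn(2) via
 * libc looks roughly like this (illustrative only):
 *
 *	extern char **environ;
 *	char *argv[] = { "ls", "-l", NULL };
 *	pid_t pid;
 *
 *	if (posix_spawn(&pid, "/bin/ls", NULL, NULL, argv, environ) == 0)
 *		(void)waitpid(pid, NULL, 0);
 */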
2871 
2872 void
2873 exec_free_emul_arg(struct exec_package *epp)
2874 {
2875 	if (epp->ep_emul_arg_free != NULL) {
2876 		KASSERT(epp->ep_emul_arg != NULL);
2877 		(*epp->ep_emul_arg_free)(epp->ep_emul_arg);
2878 		epp->ep_emul_arg_free = NULL;
2879 		epp->ep_emul_arg = NULL;
2880 	} else {
2881 		KASSERT(epp->ep_emul_arg == NULL);
2882 	}
2883 }
2884 
2885 #ifdef DEBUG_EXEC
2886 static void
2887 dump_vmcmds(const struct exec_package * const epp, size_t x, int error)
2888 {
2889 	struct exec_vmcmd *vp = &epp->ep_vmcmds.evs_cmds[0];
2890 	size_t j;
2891 
2892 	if (error == 0)
2893 		DPRINTF(("vmcmds %u\n", epp->ep_vmcmds.evs_used));
2894 	else
2895 		DPRINTF(("vmcmds %zu/%u, error %d\n", x,
2896 		    epp->ep_vmcmds.evs_used, error));
2897 
2898 	for (j = 0; j < epp->ep_vmcmds.evs_used; j++) {
2899 		DPRINTF(("vmcmd[%zu] = vmcmd_map_%s %#"
2900 		    PRIxVADDR"/%#"PRIxVSIZE" fd@%#"
2901 		    PRIxVSIZE" prot=0%o flags=%d\n", j,
2902 		    vp[j].ev_proc == vmcmd_map_pagedvn ?
2903 		    "pagedvn" :
2904 		    vp[j].ev_proc == vmcmd_map_readvn ?
2905 		    "readvn" :
2906 		    vp[j].ev_proc == vmcmd_map_zero ?
2907 		    "zero" : "*unknown*",
2908 		    vp[j].ev_addr, vp[j].ev_len,
2909 		    vp[j].ev_offset, vp[j].ev_prot,
2910 		    vp[j].ev_flags));
2911 		if (error != 0 && j == x)
2912 			DPRINTF(("     ^--- failed\n"));
2913 	}
2914 }
2915 #endif
2916