xref: /dflybsd-src/sys/kern/kern_fork.c (revision a05b5f9b08cd36ac2df5871787af502f89b049b0)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)kern_fork.c	8.6 (Berkeley) 4/8/94
39  * $FreeBSD: src/sys/kern/kern_fork.c,v 1.72.2.14 2003/06/26 04:15:10 silby Exp $
40  * $DragonFly: src/sys/kern/kern_fork.c,v 1.57 2006/09/19 11:47:35 corecode Exp $
41  */
42 
43 #include "opt_ktrace.h"
44 
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/sysproto.h>
48 #include <sys/filedesc.h>
49 #include <sys/kernel.h>
50 #include <sys/sysctl.h>
51 #include <sys/malloc.h>
52 #include <sys/proc.h>
53 #include <sys/resourcevar.h>
54 #include <sys/vnode.h>
55 #include <sys/acct.h>
56 #include <sys/ktrace.h>
57 #include <sys/unistd.h>
58 #include <sys/jail.h>
59 #include <sys/caps.h>
60 
61 #include <vm/vm.h>
62 #include <sys/lock.h>
63 #include <vm/pmap.h>
64 #include <vm/vm_map.h>
65 #include <vm/vm_extern.h>
66 #include <vm/vm_zone.h>
67 
68 #include <sys/vmmeter.h>
69 #include <sys/user.h>
70 #include <sys/thread2.h>
71 
/* Malloc type used for the fork callout entries allocated by at_fork(). */
static MALLOC_DEFINE(M_ATFORK, "atfork", "atfork callback");
73 
/*
 * These are the structures used to create a callout list for things to do
 * when forking a process.
 */
struct forklist {
	forklist_fn function;		/* callback invoked from fork1() */
	TAILQ_ENTRY(forklist) next;	/* linkage on fork_list */
};
82 
/* Head type for, and the global list of, registered fork callouts. */
TAILQ_HEAD(forklist_head, forklist);
static struct forklist_head fork_list = TAILQ_HEAD_INITIALIZER(fork_list);

/* Sleep channel: fork1() tsleep()s on its address when a limit is hit. */
int forksleep; /* Place for fork1() to sleep on. */
87 
88 /* ARGSUSED */
89 int
90 sys_fork(struct fork_args *uap)
91 {
92 	struct lwp *lp = curthread->td_lwp;
93 	struct proc *p2;
94 	int error;
95 
96 	error = fork1(lp, RFFDG | RFPROC, &p2);
97 	if (error == 0) {
98 		start_forked_proc(lp, p2);
99 		uap->sysmsg_fds[0] = p2->p_pid;
100 		uap->sysmsg_fds[1] = 0;
101 	}
102 	return error;
103 }
104 
105 /* ARGSUSED */
106 int
107 sys_vfork(struct vfork_args *uap)
108 {
109 	struct lwp *lp = curthread->td_lwp;
110 	struct proc *p2;
111 	int error;
112 
113 	error = fork1(lp, RFFDG | RFPROC | RFPPWAIT | RFMEM, &p2);
114 	if (error == 0) {
115 		start_forked_proc(lp, p2);
116 		uap->sysmsg_fds[0] = p2->p_pid;
117 		uap->sysmsg_fds[1] = 0;
118 	}
119 	return error;
120 }
121 
122 /*
123  * Handle rforks.  An rfork may (1) operate on the current process without
124  * creating a new, (2) create a new process that shared the current process's
125  * vmspace, signals, and/or descriptors, or (3) create a new process that does
126  * not share these things (normal fork).
127  *
128  * Note that we only call start_forked_proc() if a new process is actually
129  * created.
130  *
131  * rfork { int flags }
132  */
133 int
134 sys_rfork(struct rfork_args *uap)
135 {
136 	struct lwp *lp = curthread->td_lwp;
137 	struct proc *p2;
138 	int error;
139 
140 	if ((uap->flags & RFKERNELONLY) != 0)
141 		return (EINVAL);
142 
143 	error = fork1(lp, uap->flags, &p2);
144 	if (error == 0) {
145 		if (p2)
146 			start_forked_proc(lp, p2);
147 		uap->sysmsg_fds[0] = p2 ? p2->p_pid : 0;
148 		uap->sysmsg_fds[1] = 0;
149 	}
150 	return error;
151 }
152 
153 
/*
 * Current number of processes; fork1() compares it against maxproc and
 * increments it for every process it creates.  Starts at 1 to account
 * for process 0.
 */
int	nprocs = 1;		/* process 0 */
155 
156 int
157 fork1(struct lwp *lp1, int flags, struct proc **procp)
158 {
159 	struct proc *p1 = lp1->lwp_proc;
160 	struct proc *p2, *pptr;
161 	struct lwp *lp2;
162 	uid_t uid;
163 	int ok;
164 	static int curfail = 0;
165 	static struct timeval lastfail;
166 	struct forklist *ep;
167 	struct filedesc_to_leader *fdtol;
168 
169 	if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
170 		return (EINVAL);
171 
172 	/*
173 	 * Here we don't create a new process, but we divorce
174 	 * certain parts of a process from itself.
175 	 */
176 	if ((flags & RFPROC) == 0) {
177 
178 		vm_fork(lp1, 0, flags);
179 
180 		/*
181 		 * Close all file descriptors.
182 		 */
183 		if (flags & RFCFDG) {
184 			struct filedesc *fdtmp;
185 			fdtmp = fdinit(p1);
186 			fdfree(p1);
187 			p1->p_fd = fdtmp;
188 		}
189 
190 		/*
191 		 * Unshare file descriptors (from parent.)
192 		 */
193 		if (flags & RFFDG) {
194 			if (p1->p_fd->fd_refcnt > 1) {
195 				struct filedesc *newfd;
196 				newfd = fdcopy(p1);
197 				fdfree(p1);
198 				p1->p_fd = newfd;
199 			}
200 		}
201 		*procp = NULL;
202 		return (0);
203 	}
204 
205 	/*
206 	 * Although process entries are dynamically created, we still keep
207 	 * a global limit on the maximum number we will create.  Don't allow
208 	 * a nonprivileged user to use the last ten processes; don't let root
209 	 * exceed the limit. The variable nprocs is the current number of
210 	 * processes, maxproc is the limit.
211 	 */
212 	uid = p1->p_ucred->cr_ruid;
213 	if ((nprocs >= maxproc - 10 && uid != 0) || nprocs >= maxproc) {
214 		if (ppsratecheck(&lastfail, &curfail, 1))
215 			printf("maxproc limit exceeded by uid %d, please "
216 			       "see tuning(7) and login.conf(5).\n", uid);
217 		tsleep(&forksleep, 0, "fork", hz / 2);
218 		return (EAGAIN);
219 	}
220 	/*
221 	 * Increment the nprocs resource before blocking can occur.  There
222 	 * are hard-limits as to the number of processes that can run.
223 	 */
224 	nprocs++;
225 
226 	/*
227 	 * Increment the count of procs running with this uid. Don't allow
228 	 * a nonprivileged user to exceed their current limit.
229 	 */
230 	ok = chgproccnt(p1->p_ucred->cr_ruidinfo, 1,
231 		(uid != 0) ? p1->p_rlimit[RLIMIT_NPROC].rlim_cur : 0);
232 	if (!ok) {
233 		/*
234 		 * Back out the process count
235 		 */
236 		nprocs--;
237 		if (ppsratecheck(&lastfail, &curfail, 1))
238 			printf("maxproc limit exceeded by uid %d, please "
239 			       "see tuning(7) and login.conf(5).\n", uid);
240 		tsleep(&forksleep, 0, "fork", hz / 2);
241 		return (EAGAIN);
242 	}
243 
244 	/* Allocate new proc. */
245 	p2 = zalloc(proc_zone);
246 
247 	/*
248 	 * Setup linkage for kernel based threading XXX lwp
249 	 */
250 	if (flags & RFTHREAD) {
251 		p2->p_peers = p1->p_peers;
252 		p1->p_peers = p2;
253 		p2->p_leader = p1->p_leader;
254 	} else {
255 		p2->p_peers = NULL;
256 		p2->p_leader = p2;
257 	}
258 
259 	p2->p_wakeup = 0;
260 	p2->p_vmspace = NULL;
261 	p2->p_numposixlocks = 0;
262 	p2->p_emuldata = NULL;
263 	LIST_INIT(&p2->p_lwps);
264 
265 	/* XXX lwp */
266 	lp2 = &p2->p_lwp;
267 	lp2->lwp_proc = p2;
268 	lp2->lwp_tid = 0;
269 	LIST_INSERT_HEAD(&p2->p_lwps, lp2, lwp_list);
270 	p2->p_nthreads = 1;
271 	p2->p_nstopped = 0;
272 	p2->p_lasttid = 0;
273 
274 	/*
275 	 * Setting the state to SIDL protects the partially initialized
276 	 * process once it starts getting hooked into the rest of the system.
277 	 */
278 	p2->p_stat = SIDL;
279 	proc_add_allproc(p2);
280 
281 	/*
282 	 * Make a proc table entry for the new process.
283 	 * Start by zeroing the section of proc that is zero-initialized,
284 	 * then copy the section that is copied directly from the parent.
285 	 */
286 	bzero(&p2->p_startzero,
287 	    (unsigned) ((caddr_t)&p2->p_endzero - (caddr_t)&p2->p_startzero));
288 	bzero(&lp2->lwp_startzero,
289 	    (unsigned) ((caddr_t)&lp2->lwp_endzero -
290 			(caddr_t)&lp2->lwp_startzero));
291 	bcopy(&p1->p_startcopy, &p2->p_startcopy,
292 	    (unsigned) ((caddr_t)&p2->p_endcopy - (caddr_t)&p2->p_startcopy));
293 	bcopy(&p1->p_lwp.lwp_startcopy, &lp2->lwp_startcopy,
294 	    (unsigned) ((caddr_t)&lp2->lwp_endcopy -
295 			(caddr_t)&lp2->lwp_startcopy));
296 
297 	p2->p_aioinfo = NULL;
298 
299 	/*
300 	 * Duplicate sub-structures as needed.
301 	 * Increase reference counts on shared objects.
302 	 * The p_stats and p_sigacts substructs are set in vm_fork.
303 	 * p_lock is in the copy area and must be cleared.
304 	 */
305 	p2->p_flag = 0;
306 	p2->p_lock = 0;
307 	if (p1->p_flag & P_PROFIL)
308 		startprofclock(p2);
309 	p2->p_ucred = crhold(p1->p_ucred);
310 
311 	if (jailed(p2->p_ucred))
312 		p2->p_flag |= P_JAILED;
313 
314 	if (p2->p_args)
315 		p2->p_args->ar_ref++;
316 
317 	if (flags & RFSIGSHARE) {
318 		p2->p_procsig = p1->p_procsig;
319 		p2->p_procsig->ps_refcnt++;
320 		if (p1->p_sigacts == &p1->p_addr->u_sigacts) {
321 			struct sigacts *newsigacts;
322 
323 			/* Create the shared sigacts structure */
324 			MALLOC(newsigacts, struct sigacts *,
325 			    sizeof(struct sigacts), M_SUBPROC, M_WAITOK);
326 			crit_enter();
327 			/*
328 			 * Set p_sigacts to the new shared structure.
329 			 * Note that this is updating p1->p_sigacts at the
330 			 * same time, since p_sigacts is just a pointer to
331 			 * the shared p_procsig->ps_sigacts.
332 			 */
333 			p2->p_sigacts  = newsigacts;
334 			bcopy(&p1->p_addr->u_sigacts, p2->p_sigacts,
335 			    sizeof(*p2->p_sigacts));
336 			*p2->p_sigacts = p1->p_addr->u_sigacts;
337 			crit_exit();
338 		}
339 	} else {
340 		MALLOC(p2->p_procsig, struct procsig *, sizeof(struct procsig),
341 		    M_SUBPROC, M_WAITOK);
342 		bcopy(p1->p_procsig, p2->p_procsig, sizeof(*p2->p_procsig));
343 		p2->p_procsig->ps_refcnt = 1;
344 		p2->p_sigacts = NULL;	/* finished in vm_fork() */
345 	}
346 	if (flags & RFLINUXTHPN)
347 	        p2->p_sigparent = SIGUSR1;
348 	else
349 	        p2->p_sigparent = SIGCHLD;
350 
351 	/* bump references to the text vnode (for procfs) */
352 	p2->p_textvp = p1->p_textvp;
353 	if (p2->p_textvp)
354 		vref(p2->p_textvp);
355 
356 	/*
357 	 * Handle file descriptors
358 	 */
359 	if (flags & RFCFDG) {
360 		p2->p_fd = fdinit(p1);
361 		fdtol = NULL;
362 	} else if (flags & RFFDG) {
363 		p2->p_fd = fdcopy(p1);
364 		fdtol = NULL;
365 	} else {
366 		p2->p_fd = fdshare(p1);
367 		if (p1->p_fdtol == NULL)
368 			p1->p_fdtol =
369 				filedesc_to_leader_alloc(NULL,
370 							 p1->p_leader);
371 		if ((flags & RFTHREAD) != 0) {
372 			/*
373 			 * Shared file descriptor table and
374 			 * shared process leaders.
375 			 */
376 			fdtol = p1->p_fdtol;
377 			fdtol->fdl_refcount++;
378 		} else {
379 			/*
380 			 * Shared file descriptor table, and
381 			 * different process leaders
382 			 */
383 			fdtol = filedesc_to_leader_alloc(p1->p_fdtol, p2);
384 		}
385 	}
386 	p2->p_fdtol = fdtol;
387 	p2->p_limit = plimit_fork(p1->p_limit);
388 
389 	/*
390 	 * Preserve some more flags in subprocess.  P_PROFIL has already
391 	 * been preserved.
392 	 */
393 	p2->p_flag |= p1->p_flag & (P_SUGID | P_ALTSTACK);
394 	if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT)
395 		p2->p_flag |= P_CONTROLT;
396 	if (flags & RFPPWAIT)
397 		p2->p_flag |= P_PPWAIT;
398 
399 	/*
400 	 * Inherit the virtual kernel structure (allows a virtual kernel
401 	 * to fork to simulate multiple cpus).
402 	 */
403 	if ((p2->p_vkernel = p1->p_vkernel) != NULL)
404 		vkernel_hold(p2->p_vkernel);
405 
406 	/*
407 	 * Once we are on a pglist we may receive signals.  XXX we might
408 	 * race a ^C being sent to the process group by not receiving it
409 	 * at all prior to this line.
410 	 */
411 	LIST_INSERT_AFTER(p1, p2, p_pglist);
412 
413 	/*
414 	 * Attach the new process to its parent.
415 	 *
416 	 * If RFNOWAIT is set, the newly created process becomes a child
417 	 * of init.  This effectively disassociates the child from the
418 	 * parent.
419 	 */
420 	if (flags & RFNOWAIT)
421 		pptr = initproc;
422 	else
423 		pptr = p1;
424 	p2->p_pptr = pptr;
425 	LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
426 	LIST_INIT(&p2->p_children);
427 	varsymset_init(&p2->p_varsymset, &p1->p_varsymset);
428 	callout_init(&p2->p_ithandle);
429 
430 #ifdef KTRACE
431 	/*
432 	 * Copy traceflag and tracefile if enabled.  If not inherited,
433 	 * these were zeroed above but we still could have a trace race
434 	 * so make sure p2's p_tracenode is NULL.
435 	 */
436 	if ((p1->p_traceflag & KTRFAC_INHERIT) && p2->p_tracenode == NULL) {
437 		p2->p_traceflag = p1->p_traceflag;
438 		p2->p_tracenode = ktrinherit(p1->p_tracenode);
439 	}
440 #endif
441 
442 	/*
443 	 * Inherit the scheduler and initialize scheduler-related fields.
444 	 * Set cpbase to the last timeout that occured (not the upcoming
445 	 * timeout).
446 	 *
447 	 * A critical section is required since a timer IPI can update
448 	 * scheduler specific data.
449 	 */
450 	crit_enter();
451 	p2->p_usched = p1->p_usched;
452 	lp2->lwp_cpbase = mycpu->gd_schedclock.time -
453 			mycpu->gd_schedclock.periodic;
454 	p2->p_usched->heuristic_forking(&p1->p_lwp, lp2);
455 	crit_exit();
456 
457 	/*
458 	 * This begins the section where we must prevent the parent
459 	 * from being swapped.
460 	 */
461 	PHOLD(p1);
462 
463 	/*
464 	 * Finish creating the child process.  It will return via a different
465 	 * execution path later.  (ie: directly into user mode)
466 	 */
467 	vm_fork(lp1, p2, flags);
468 	caps_fork(p1, p2, flags);
469 
470 	if (flags == (RFFDG | RFPROC)) {
471 		mycpu->gd_cnt.v_forks++;
472 		mycpu->gd_cnt.v_forkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize;
473 	} else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) {
474 		mycpu->gd_cnt.v_vforks++;
475 		mycpu->gd_cnt.v_vforkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize;
476 	} else if (p1 == &proc0) {
477 		mycpu->gd_cnt.v_kthreads++;
478 		mycpu->gd_cnt.v_kthreadpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize;
479 	} else {
480 		mycpu->gd_cnt.v_rforks++;
481 		mycpu->gd_cnt.v_rforkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize;
482 	}
483 
484 	/*
485 	 * Both processes are set up, now check if any loadable modules want
486 	 * to adjust anything.
487 	 *   What if they have an error? XXX
488 	 */
489 	TAILQ_FOREACH(ep, &fork_list, next) {
490 		(*ep->function)(p1, p2, flags);
491 	}
492 
493 	/*
494 	 * Set the start time.  Note that the process is not runnable.  The
495 	 * caller is responsible for making it runnable.
496 	 */
497 	microtime(&p2->p_start);
498 	p2->p_acflag = AFORK;
499 
500 	/*
501 	 * tell any interested parties about the new process
502 	 */
503 	KNOTE(&p1->p_klist, NOTE_FORK | p2->p_pid);
504 
505 	/*
506 	 * Return child proc pointer to parent.
507 	 */
508 	*procp = p2;
509 	return (0);
510 }
511 
512 /*
513  * The next two functionms are general routines to handle adding/deleting
514  * items on the fork callout list.
515  *
516  * at_fork():
517  * Take the arguments given and put them onto the fork callout list,
518  * However first make sure that it's not already there.
519  * Returns 0 on success or a standard error number.
520  */
521 int
522 at_fork(forklist_fn function)
523 {
524 	struct forklist *ep;
525 
526 #ifdef INVARIANTS
527 	/* let the programmer know if he's been stupid */
528 	if (rm_at_fork(function)) {
529 		printf("WARNING: fork callout entry (%p) already present\n",
530 		    function);
531 	}
532 #endif
533 	ep = kmalloc(sizeof(*ep), M_ATFORK, M_WAITOK|M_ZERO);
534 	ep->function = function;
535 	TAILQ_INSERT_TAIL(&fork_list, ep, next);
536 	return (0);
537 }
538 
539 /*
540  * Scan the exit callout list for the given item and remove it..
541  * Returns the number of items removed (0 or 1)
542  */
543 int
544 rm_at_fork(forklist_fn function)
545 {
546 	struct forklist *ep;
547 
548 	TAILQ_FOREACH(ep, &fork_list, next) {
549 		if (ep->function == function) {
550 			TAILQ_REMOVE(&fork_list, ep, next);
551 			kfree(ep, M_ATFORK);
552 			return(1);
553 		}
554 	}
555 	return (0);
556 }
557 
558 /*
559  * Add a forked process to the run queue after any remaining setup, such
560  * as setting the fork handler, has been completed.
561  */
562 void
563 start_forked_proc(struct lwp *lp1, struct proc *p2)
564 {
565 	struct lwp *lp2;
566 
567 	KKASSERT(p2 != NULL && p2->p_nthreads == 1);
568 
569 	lp2 = LIST_FIRST(&p2->p_lwps);
570 
571 	/*
572 	 * Move from SIDL to RUN queue, and activate the process's thread.
573 	 * Activation of the thread effectively makes the process "a"
574 	 * current process, so we do not setrunqueue().
575 	 *
576 	 * YYY setrunqueue works here but we should clean up the trampoline
577 	 * code so we just schedule the LWKT thread and let the trampoline
578 	 * deal with the userland scheduler on return to userland.
579 	 */
580 	KASSERT(p2->p_stat == SIDL,
581 	    ("cannot start forked process, bad status: %p", p2));
582 	p2->p_usched->resetpriority(lp2);
583 	crit_enter();
584 	p2->p_stat = SRUN;
585 	p2->p_usched->setrunqueue(lp2);
586 	crit_exit();
587 
588 	/*
589 	 * Now can be swapped.
590 	 */
591 	PRELE(lp1->lwp_proc);
592 
593 	/*
594 	 * Preserve synchronization semantics of vfork.  If waiting for
595 	 * child to exec or exit, set P_PPWAIT on child, and sleep on our
596 	 * proc (in case of exit).
597 	 */
598 	while (p2->p_flag & P_PPWAIT)
599 		tsleep(lp1->lwp_proc, 0, "ppwait", 0);
600 }
601