/*	$NetBSD: kern_lwp.c,v 1.269 2023/12/20 21:03:50 andvar Exp $	*/

/*-
 * Copyright (c) 2001, 2006, 2007, 2008, 2009, 2019, 2020, 2023
 *     The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Nathan J. Williams, and Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Overview
 *
 *        Lightweight processes (LWPs) are the basic unit or thread of
 *        execution within the kernel.  The core state of an LWP is described
 *        by "struct lwp", also known as lwp_t.
 *
 *        Each LWP is contained within a process (described by "struct proc").
 *        Every process contains at least one LWP, but may contain more.  The
 *        process describes attributes shared among all of its LWPs such as a
 *        private address space, global execution state (stopped, active,
 *        zombie, ...), signal disposition and so on.  On a multiprocessor
 *        machine, multiple LWPs may be executing concurrently in the kernel.
 *
 * Execution states
 *
 *        At any given time, an LWP has overall state that is described by
 *        lwp::l_stat.  The states are broken into two sets below.  The first
 *        set is guaranteed to represent the absolute, current state of the
 *        LWP:
 *
 *        LSONPROC
 *
 *                On processor: the LWP is executing on a CPU, either in the
 *                kernel or in user space.
 *
 *        LSRUN
 *
 *                Runnable: the LWP is parked on a run queue, and may soon be
 *                chosen to run by an idle processor, or by a processor that
 *                has been asked to preempt a currently running but lower
 *                priority LWP.
 *
 *        LSIDL
 *
 *                Idle: the LWP has been created but has not yet executed, or
 *                it has ceased executing a unit of work and is waiting to be
 *                started again.  This state exists so that the LWP can occupy
 *                a slot in the process & PID table, but without having to
 *                worry about being touched; lookups of the LWP by ID will
 *                fail while in this state.  The LWP will become visible for
 *                lookup once its state transitions further.  Some special
 *                kernel threads also (ab)use this state to indicate that they
 *                are idle (soft interrupts and idle LWPs).
 *
 *        LSSUSPENDED:
 *
 *                Suspended: the LWP has had its execution suspended by
 *                another LWP in the same process using the _lwp_suspend()
 *                system call.  User-level LWPs also enter the suspended
 *                state when the system is shutting down.
 *
 *        The second set represents a "statement of intent" on behalf of the
 *        LWP.  The LWP may in fact be executing on a processor, or may be
 *        sleeping or idle.  It is expected to take the necessary action to
 *        stop executing or become "running" again within a short timeframe.
 *        The LP_RUNNING flag in lwp::l_pflag indicates that an LWP is running.
 *        Importantly, it indicates that its state is tied to a CPU.
 *
 *        LSZOMB:
 *
 *                Dead or dying: the LWP has released most of its resources
 *                and is about to switch away into oblivion, or has already
 *                switched away.  When it switches away, its few remaining
 *                resources can be collected.
 *
 *        LSSLEEP:
 *
 *                Sleeping: the LWP has entered itself onto a sleep queue, and
 *                has switched away or will switch away shortly to allow other
 *                LWPs to run on the CPU.
 *
 *        LSSTOP:
 *
 *                Stopped: the LWP has been stopped as a result of a job
 *                control signal, or as a result of the ptrace() interface.
 *
 *                Stopped LWPs may run briefly within the kernel to handle
 *                signals that they receive, but will not return to user space
 *                until their process' state is changed away from stopped.
 *
 *                Single LWPs within a process can not be set stopped
 *                selectively: all actions that can stop or continue LWPs
 *                occur at the process level.
 *
 * State transitions
 *
 *        Note that the LSSTOP state may only be set when returning to
 *        user space in userret(), or when sleeping interruptibly.  The
 *        LSSUSPENDED state may only be set in userret().  Before setting
 *        those states, we try to ensure that the LWPs will release all
 *        locks that they hold, and at a minimum try to ensure that the
 *        LWP can be set runnable again by a signal.
 *
 *        LWPs may transition states in the following ways:
 *
 *         RUN -------> ONPROC            ONPROC -----> RUN
 *                                               > SLEEP
 *                                               > STOPPED
 *                                               > SUSPENDED
 *                                               > ZOMB
 *                                               > IDL (special cases)
 *
 *         STOPPED ---> RUN               SUSPENDED --> RUN
 *                    > SLEEP
 *
 *         SLEEP -----> ONPROC            IDL --------> RUN
 *                    > RUN                      > SUSPENDED
 *                    > STOPPED                  > STOPPED
 *                                               > ONPROC (special cases)
 *
 *        Some state transitions are only possible with kernel threads (eg
 *        ONPROC -> IDL) and happen under tightly controlled circumstances
 *        free of unwanted side effects.
 *
 * Migration
 *
 *        Migration of threads from one CPU to another may be performed
 *        internally by the scheduler via the sched_takecpu() or
 *        sched_catchlwp() functions.  The universal lwp_migrate() function
 *        should be used for any other case.  Subsystems in the kernel must
 *        be aware that the CPU of an LWP may change while the LWP is not
 *        locked.
 *
 * Locking
 *
 *        The majority of fields in 'struct lwp' are covered by a single,
 *        general spin lock pointed to by lwp::l_mutex.  The locks covering
 *        each field are documented in sys/lwp.h.
 *
 *        State transitions must be made with the LWP's general lock held,
 *        and may cause the LWP's lock pointer to change.  Manipulation of
 *        the general lock is not performed directly, but through calls to
 *        lwp_lock(), lwp_unlock() and others.  It should be noted that the
 *        adaptive locks are not allowed to be released while the LWP's lock
 *        is being held (unlike for other spin-locks).
 *
 *        States and their associated locks:
 *
 *        LSIDL, LSONPROC, LSZOMB, LSSUSPENDED:
 *
 *                Always covered by spc_lwplock, which protects LWPs not
 *                associated with any other sync object.  This is a per-CPU
 *                lock and matches lwp::l_cpu.
 *
 *        LSRUN:
 *
 *                Always covered by spc_mutex, which protects the run queues.
 *                This is a per-CPU lock and matches lwp::l_cpu.
 *
 *        LSSLEEP:
 *
 *                Covered by a lock associated with the sleep queue (sometimes
 *                a turnstile sleep queue) that the LWP resides on.  This can
 *                be spc_lwplock for SOBJ_SLEEPQ_NULL (an "untracked" sleep).
 *
 *        LSSTOP:
 *
 *                If the LWP was previously sleeping (l_wchan != NULL), then
 *                l_mutex references the sleep queue lock.  If the LWP was
 *                runnable or on the CPU when halted, or has been removed from
 *                the sleep queue since halted, then the lock is spc_lwplock.
 *
 *        The lock order is as follows:
 *
 *                sleepq -> turnstile -> spc_lwplock -> spc_mutex
 *
 *        Each process has a scheduler state lock (proc::p_lock), and a
 *        number of counters on LWPs and their states: p_nzlwps, p_nrlwps, and
 *        so on.  When an LWP is to be entered into or removed from one of the
 *        following states, p_lock must be held and the process wide counters
 *        adjusted:
 *
 *                LSIDL, LSZOMB, LSSTOP, LSSUSPENDED
 *
 *        (But not always for kernel threads.  There are some special cases
 *        as mentioned above: soft interrupts, and the idle loops.)
 *
 *        Note that an LWP is considered running or likely to run soon if in
 *        one of the following states.  This affects the value of p_nrlwps:
 *
 *                LSRUN, LSONPROC, LSSLEEP
 *
 *        p_lock does not need to be held when transitioning among these
 *        three states, hence p_lock is rarely taken for state transitions.
 */
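
/*
 * As an illustrative sketch of the locking pattern described above (the
 * condition tested here is only an example; see lwp_suspend() and
 * lwp_unstop() below for real uses), a consumer takes the LWP's general
 * lock before inspecting volatile state, and then either hands the LWP
 * off - setrunnable() consumes the lock - or drops the lock again:
 *
 *        lwp_lock(l);
 *        if (l->l_stat == LSSLEEP && (l->l_flag & LW_SINTR) != 0)
 *                setrunnable(l);         (releases the LWP lock)
 *        else
 *                lwp_unlock(l);
 */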

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_lwp.c,v 1.269 2023/12/20 21:03:50 andvar Exp $");

#include "opt_ddb.h"
#include "opt_lockdebug.h"
#include "opt_dtrace.h"

#define _LWP_API_PRIVATE

#include <sys/param.h>

#include <sys/atomic.h>
#include <sys/cprng.h>
#include <sys/cpu.h>
#include <sys/dtrace_bsd.h>
#include <sys/filedesc.h>
#include <sys/fstrans.h>
#include <sys/futex.h>
#include <sys/intr.h>
#include <sys/kauth.h>
#include <sys/kcov.h>
#include <sys/kmem.h>
#include <sys/lockdebug.h>
#include <sys/lwpctl.h>
#include <sys/msan.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/pset.h>
#include <sys/psref.h>
#include <sys/ptrace.h>
#include <sys/sdt.h>
#include <sys/sleepq.h>
#include <sys/syncobj.h>
#include <sys/syscall_stats.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/uidinfo.h>
#include <sys/xcall.h>

#include <uvm/uvm_extern.h>
#include <uvm/uvm_object.h>

static pool_cache_t lwp_cache __read_mostly;
struct lwplist alllwp __cacheline_aligned;

static int lwp_ctor(void *, void *, int);
static void lwp_dtor(void *, void *);

/* DTrace proc provider probes */
SDT_PROVIDER_DEFINE(proc);

SDT_PROBE_DEFINE1(proc, kernel, , lwp__create, "struct lwp *");
SDT_PROBE_DEFINE1(proc, kernel, , lwp__start, "struct lwp *");
SDT_PROBE_DEFINE1(proc, kernel, , lwp__exit, "struct lwp *");

struct turnstile turnstile0 __cacheline_aligned;
struct lwp lwp0 __aligned(MIN_LWP_ALIGNMENT) = {
#ifdef LWP0_CPU_INFO
        .l_cpu = LWP0_CPU_INFO,
#endif
#ifdef LWP0_MD_INITIALIZER
        .l_md = LWP0_MD_INITIALIZER,
#endif
        .l_proc = &proc0,
        .l_lid = 0,             /* we own proc0's slot in the pid table */
        .l_flag = LW_SYSTEM,
        .l_stat = LSONPROC,
        .l_ts = &turnstile0,
        .l_syncobj = &sched_syncobj,
        .l_refcnt = 0,
        .l_priority = PRI_USER + NPRI_USER - 1,
        .l_inheritedprio = -1,
        .l_class = SCHED_OTHER,
        .l_psid = PS_NONE,
        .l_pi_lenders = SLIST_HEAD_INITIALIZER(&lwp0.l_pi_lenders),
        .l_name = __UNCONST("swapper"),
        .l_fd = &filedesc0,
};

static int
lwp_maxlwp(void)
{
        /* Assume 1 LWP per 1MiB. */
        uint64_t lwps_per = ctob(physmem) / (1024 * 1024);

        return MAX(MIN(MAXMAXLWP, lwps_per), MAXLWP);
}
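
/*
 * As a rough worked example of the heuristic above (the figures are only
 * illustrative): on a machine with 4 GiB of physical memory,
 * ctob(physmem) / (1024 * 1024) is 4096, so the default becomes 4096
 * provided that value falls within the [MAXLWP, MAXMAXLWP] range enforced
 * by the MIN/MAX pair; otherwise the nearer bound is used instead.
 */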

static int sysctl_kern_maxlwp(SYSCTLFN_PROTO);

/*
 * sysctl helper routine for kern.maxlwp.  Ensures that the new
 * values are not too low or too high.
 */
static int
sysctl_kern_maxlwp(SYSCTLFN_ARGS)
{
        int error, nmaxlwp;
        struct sysctlnode node;

        nmaxlwp = maxlwp;
        node = *rnode;
        node.sysctl_data = &nmaxlwp;
        error = sysctl_lookup(SYSCTLFN_CALL(&node));
        if (error || newp == NULL)
                return error;

        if (nmaxlwp < 0 || nmaxlwp >= MAXMAXLWP)
                return EINVAL;
        if (nmaxlwp > lwp_maxlwp())
                return EINVAL;
        maxlwp = nmaxlwp;

        return 0;
}

static void
sysctl_kern_lwp_setup(void)
{
        sysctl_createv(NULL, 0, NULL, NULL,
            CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
            CTLTYPE_INT, "maxlwp",
            SYSCTL_DESCR("Maximum number of simultaneous threads"),
            sysctl_kern_maxlwp, 0, NULL, 0,
            CTL_KERN, CTL_CREATE, CTL_EOL);
}

void
lwpinit(void)
{

        LIST_INIT(&alllwp);
        lwpinit_specificdata();
        /*
         * Provide a barrier to ensure that all mutex_oncpu() and rw_oncpu()
         * calls will exit before memory of LWPs is returned to the pool, where
         * KVA of LWP structure might be freed and re-used for other purposes.
         * Kernel preemption is disabled around mutex_oncpu() and rw_oncpu()
         * callers, therefore a regular passive serialization barrier will
         * do the job.
         */
        lwp_cache = pool_cache_init(sizeof(lwp_t), MIN_LWP_ALIGNMENT, 0,
            PR_PSERIALIZE, "lwppl", NULL, IPL_NONE, lwp_ctor, lwp_dtor, NULL);

        maxlwp = lwp_maxlwp();
        sysctl_kern_lwp_setup();
}

void
lwp0_init(void)
{
        struct lwp *l = &lwp0;

        KASSERT((void *)uvm_lwp_getuarea(l) != NULL);

        LIST_INSERT_HEAD(&alllwp, l, l_list);

        callout_init(&l->l_timeout_ch, CALLOUT_MPSAFE);
        callout_setfunc(&l->l_timeout_ch, sleepq_timeout, l);
        cv_init(&l->l_sigcv, "sigwait");
        cv_init(&l->l_waitcv, "vfork");

        l->l_cred = kauth_cred_hold(proc0.p_cred);

        kdtrace_thread_ctor(NULL, l);
        lwp_initspecific(l);

        SYSCALL_TIME_LWP_INIT(l);
}

/*
 * Initialize the non-zeroed portion of an lwp_t.
 */
static int
lwp_ctor(void *arg, void *obj, int flags)
{
        lwp_t *l = obj;

        l->l_stat = LSIDL;
        l->l_cpu = curcpu();
        l->l_mutex = l->l_cpu->ci_schedstate.spc_lwplock;
        l->l_ts = kmem_alloc(sizeof(*l->l_ts), flags == PR_WAITOK ?
            KM_SLEEP : KM_NOSLEEP);

        if (l->l_ts == NULL) {
                return ENOMEM;
        } else {
                turnstile_ctor(l->l_ts);
                return 0;
        }
}

static void
lwp_dtor(void *arg, void *obj)
{
        lwp_t *l = obj;

        /*
         * The value of l->l_cpu must still be valid at this point.
         */
        KASSERT(l->l_cpu != NULL);

        /*
         * We can't return turnstile0 to the pool (it didn't come from it),
         * so if it comes up just drop it quietly and move on.
         */
        if (l->l_ts != &turnstile0)
                kmem_free(l->l_ts, sizeof(*l->l_ts));
}

/*
 * Set an LWP suspended.
 *
 * Must be called with p_lock held, and the LWP locked.  Will unlock the
 * LWP before return.
 */
int
lwp_suspend(struct lwp *curl, struct lwp *t)
{
        int error;

        KASSERT(mutex_owned(t->l_proc->p_lock));
        KASSERT(lwp_locked(t, NULL));

        KASSERT(curl != t || curl->l_stat == LSONPROC);

        /*
         * If the current LWP has been told to exit, we must not suspend anyone
         * else or deadlock could occur.  We won't return to userspace.
         */
        if ((curl->l_flag & (LW_WEXIT | LW_WCORE)) != 0) {
                lwp_unlock(t);
                return (EDEADLK);
        }

        if ((t->l_flag & LW_DBGSUSPEND) != 0) {
                lwp_unlock(t);
                return 0;
        }

        error = 0;

        switch (t->l_stat) {
        case LSRUN:
        case LSONPROC:
                t->l_flag |= LW_WSUSPEND;
                lwp_need_userret(t);
                lwp_unlock(t);
                break;

        case LSSLEEP:
                t->l_flag |= LW_WSUSPEND;
                lwp_need_userret(t);

                /*
                 * Kick the LWP and try to get it to the kernel boundary
                 * so that it will release any locks that it holds.
                 * setrunnable() will release the lock.
                 */
                if ((t->l_flag & LW_SINTR) != 0)
                        setrunnable(t);
                else
                        lwp_unlock(t);
                break;

        case LSSUSPENDED:
                lwp_unlock(t);
                break;

        case LSSTOP:
                t->l_flag |= LW_WSUSPEND;
                lwp_need_userret(t);
                setrunnable(t);
                break;

        case LSIDL:
        case LSZOMB:
                error = EINTR;  /* It's what Solaris does..... */
                lwp_unlock(t);
                break;
        }

        return (error);
}
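
/*
 * A minimal sketch of how a caller is expected to drive lwp_suspend()
 * (illustrative only; target lookup and error handling are omitted).
 * Both locks named in the comment above are taken first, and
 * lwp_suspend() itself drops the LWP lock:
 *
 *        mutex_enter(p->p_lock);
 *        lwp_lock(t);
 *        error = lwp_suspend(curlwp, t);         (unlocks t)
 *        mutex_exit(p->p_lock);
 */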

/*
 * Restart a suspended LWP.
 *
 * Must be called with p_lock held, and the LWP locked.  Will unlock the
 * LWP before return.
 */
void
lwp_continue(struct lwp *l)
{

        KASSERT(mutex_owned(l->l_proc->p_lock));
        KASSERT(lwp_locked(l, NULL));

        /* If rebooting or not suspended, then just bail out. */
        if ((l->l_flag & LW_WREBOOT) != 0) {
                lwp_unlock(l);
                return;
        }

        l->l_flag &= ~LW_WSUSPEND;

        if (l->l_stat != LSSUSPENDED || (l->l_flag & LW_DBGSUSPEND) != 0) {
                lwp_unlock(l);
                return;
        }

        /* setrunnable() will release the lock. */
        setrunnable(l);
}

/*
 * Restart a stopped LWP.
 *
 * Must be called with p_lock held, and the LWP NOT locked.  Will unlock the
 * LWP before return.
 */
void
lwp_unstop(struct lwp *l)
{
        struct proc *p = l->l_proc;

        KASSERT(mutex_owned(&proc_lock));
        KASSERT(mutex_owned(p->p_lock));

        lwp_lock(l);

        KASSERT((l->l_flag & LW_DBGSUSPEND) == 0);

        /* If not stopped, then just bail out. */
        if (l->l_stat != LSSTOP) {
                lwp_unlock(l);
                return;
        }

        p->p_stat = SACTIVE;
        p->p_sflag &= ~PS_STOPPING;

        if (!p->p_waited)
                p->p_pptr->p_nstopchild--;

        if (l->l_wchan == NULL) {
                /* setrunnable() will release the lock. */
                setrunnable(l);
        } else if (p->p_xsig && (l->l_flag & LW_SINTR) != 0) {
                /* setrunnable() so we can receive the signal */
                setrunnable(l);
        } else {
                l->l_stat = LSSLEEP;
                p->p_nrlwps++;
                lwp_unlock(l);
        }
}

/*
 * Wait for an LWP within the current process to exit.  If 'lid' is
 * non-zero, we are waiting for a specific LWP.
 *
 * Must be called with p->p_lock held.
 */
int
lwp_wait(struct lwp *l, lwpid_t lid, lwpid_t *departed, bool exiting)
{
        const lwpid_t curlid = l->l_lid;
        proc_t *p = l->l_proc;
        lwp_t *l2, *next;
        int error;

        KASSERT(mutex_owned(p->p_lock));

        p->p_nlwpwait++;
        l->l_waitingfor = lid;

        for (;;) {
                int nfound;

                /*
                 * Avoid a race between exit1() and sigexit(): if the
                 * process is dumping core, then we need to bail out: call
                 * into lwp_userret() where we will be suspended until the
                 * deed is done.
                 */
                if ((p->p_sflag & PS_WCORE) != 0) {
                        mutex_exit(p->p_lock);
                        lwp_userret(l);
                        KASSERT(false);
                }

                /*
                 * First off, drain any detached LWP that is waiting to be
                 * reaped.
                 */
                if ((l2 = p->p_zomblwp) != NULL) {
                        p->p_zomblwp = NULL;
                        lwp_free(l2, false, false);     /* releases proc mutex */
                        mutex_enter(p->p_lock);
                        continue;
                }

                /*
                 * Now look for an LWP to collect.  If the whole process is
                 * exiting, count detached LWPs as eligible to be collected,
                 * but don't drain them here.
                 */
                nfound = 0;
                error = 0;

                /*
                 * If given a specific LID, go via pid_table and make sure
                 * it's not detached.
                 */
                if (lid != 0) {
                        l2 = proc_find_lwp(p, lid);
                        if (l2 == NULL) {
                                error = ESRCH;
                                break;
                        }
                        KASSERT(l2->l_lid == lid);
                        if ((l2->l_prflag & LPR_DETACHED) != 0) {
                                error = EINVAL;
                                break;
                        }
                } else {
                        l2 = LIST_FIRST(&p->p_lwps);
                }
                for (; l2 != NULL; l2 = next) {
                        next = (lid != 0 ? NULL : LIST_NEXT(l2, l_sibling));

                        /*
                         * If a specific wait and the target is waiting on
                         * us, then avoid deadlock.  This also traps LWPs
                         * that try to wait on themselves.
                         *
                         * Note that this does not handle more complicated
                         * cycles, like: t1 -> t2 -> t3 -> t1.  The process
                         * can still be killed so it is not a major problem.
                         */
                        if (l2->l_lid == lid && l2->l_waitingfor == curlid) {
                                error = EDEADLK;
                                break;
                        }
                        if (l2 == l)
                                continue;
                        if ((l2->l_prflag & LPR_DETACHED) != 0) {
                                nfound += exiting;
                                continue;
                        }
                        if (lid != 0) {
                                /*
                                 * Mark this LWP as the first waiter, if there
                                 * is no other.
                                 */
                                if (l2->l_waiter == 0)
                                        l2->l_waiter = curlid;
                        } else if (l2->l_waiter != 0) {
                                /*
                                 * It already has a waiter - so don't
                                 * collect it.  If the waiter doesn't
                                 * grab it we'll get another chance
                                 * later.
                                 */
                                nfound++;
                                continue;
                        }
                        nfound++;

                        /* No need to lock the LWP in order to see LSZOMB. */
                        if (l2->l_stat != LSZOMB)
                                continue;

                        /*
                         * We're no longer waiting.  Reset the "first waiter"
                         * pointer on the target, in case it was us.
                         */
                        l->l_waitingfor = 0;
                        l2->l_waiter = 0;
                        p->p_nlwpwait--;
                        if (departed)
                                *departed = l2->l_lid;
                        sched_lwp_collect(l2);

                        /* lwp_free() releases the proc lock. */
                        lwp_free(l2, false, false);
                        mutex_enter(p->p_lock);
                        return 0;
                }

                if (error != 0)
                        break;
                if (nfound == 0) {
                        error = ESRCH;
                        break;
                }

                /*
                 * Note: since the lock will be dropped, need to restart on
                 * wakeup to run all LWPs again, e.g. there may be new LWPs.
                 */
                if (exiting) {
                        KASSERT(p->p_nlwps > 1);
                        error = cv_timedwait(&p->p_lwpcv, p->p_lock, 1);
                        break;
                }

                /*
                 * Break out if all LWPs are in _lwp_wait().  There are
                 * other ways to hang the process with _lwp_wait(), but the
                 * sleep is interruptible so there is little point checking
                 * for them.
                 */
                if (p->p_nlwpwait == p->p_nlwps) {
                        error = EDEADLK;
                        break;
                }

                /*
                 * Sit around and wait for something to happen.  We'll be
                 * awoken if any of the conditions examined change: if an
                 * LWP exits, is collected, or is detached.
                 */
                if ((error = cv_wait_sig(&p->p_lwpcv, p->p_lock)) != 0)
                        break;
        }

        /*
         * We didn't find any LWPs to collect, we may have received a
         * signal, or some other condition has caused us to bail out.
         *
         * If waiting on a specific LWP, clear the waiters marker: some
         * other LWP may want it.  Then, kick all the remaining waiters
         * so that they can re-check for zombies and for deadlock.
         */
        if (lid != 0) {
                l2 = proc_find_lwp(p, lid);
                KASSERT(l2 == NULL || l2->l_lid == lid);

                if (l2 != NULL && l2->l_waiter == curlid)
                        l2->l_waiter = 0;
        }
        p->p_nlwpwait--;
        l->l_waitingfor = 0;
        cv_broadcast(&p->p_lwpcv);

        return error;
}

/*
 * Create a new LWP within process 'p2', using LWP 'l1' as a template.
 * The new LWP is created in state LSIDL and must be set running,
 * suspended, or stopped by the caller.
 */
int
lwp_create(lwp_t *l1, proc_t *p2, vaddr_t uaddr, int flags,
    void *stack, size_t stacksize, void (*func)(void *), void *arg,
    lwp_t **rnewlwpp, int sclass, const sigset_t *sigmask,
    const stack_t *sigstk)
{
        struct lwp *l2;

        KASSERT(l1 == curlwp || l1->l_proc == &proc0);

        /*
         * Enforce limits, excluding the first lwp and kthreads.  We must
         * use the process credentials here when adjusting the limit, as
         * they are what's tied to the accounting entity.  However for
         * authorizing the action, we'll use the LWP's credentials.
         */
        mutex_enter(p2->p_lock);
        if (p2->p_nlwps != 0 && p2 != &proc0) {
                uid_t uid = kauth_cred_getuid(p2->p_cred);
                int count = chglwpcnt(uid, 1);
                if (__predict_false(count >
                    p2->p_rlimit[RLIMIT_NTHR].rlim_cur)) {
                        if (kauth_authorize_process(l1->l_cred,
                            KAUTH_PROCESS_RLIMIT, p2,
                            KAUTH_ARG(KAUTH_REQ_PROCESS_RLIMIT_BYPASS),
                            &p2->p_rlimit[RLIMIT_NTHR], KAUTH_ARG(RLIMIT_NTHR))
                            != 0) {
                                (void)chglwpcnt(uid, -1);
                                mutex_exit(p2->p_lock);
                                return EAGAIN;
                        }
                }
        }

        /*
         * First off, reap any detached LWP waiting to be collected.
         * We can re-use its LWP structure and turnstile.
         */
        if ((l2 = p2->p_zomblwp) != NULL) {
                p2->p_zomblwp = NULL;
                lwp_free(l2, true, false);
                /* p2 now unlocked by lwp_free() */
                KASSERT(l2->l_ts != NULL);
                KASSERT(l2->l_inheritedprio == -1);
                KASSERT(SLIST_EMPTY(&l2->l_pi_lenders));
                memset(&l2->l_startzero, 0, sizeof(*l2) -
                    offsetof(lwp_t, l_startzero));
        } else {
                mutex_exit(p2->p_lock);
                l2 = pool_cache_get(lwp_cache, PR_WAITOK);
                memset(&l2->l_startzero, 0, sizeof(*l2) -
                    offsetof(lwp_t, l_startzero));
                SLIST_INIT(&l2->l_pi_lenders);
        }

        /*
         * Because of lockless lookup via pid_table, the LWP can be locked
         * and inspected briefly even after it's freed, so a few fields are
         * kept stable.
         */
        KASSERT(l2->l_stat == LSIDL);
        KASSERT(l2->l_cpu != NULL);
        KASSERT(l2->l_ts != NULL);
        KASSERT(l2->l_mutex == l2->l_cpu->ci_schedstate.spc_lwplock);

        l2->l_proc = p2;
        l2->l_refcnt = 0;
        l2->l_class = sclass;

        /*
         * Allocate a process ID for this LWP.  We need to do this now
         * while we can still unwind if it fails.  Because we're marked
         * as LSIDL, no lookups by the ID will succeed.
         *
         * N.B. this will always succeed for the first LWP in a process,
         * because proc_alloc_lwpid() will usurp the slot.  Also note
         * that l2->l_proc MUST be valid so that lookups of the proc
         * will succeed, even if the LWP itself is not visible.
         */
        if (__predict_false(proc_alloc_lwpid(p2, l2) == -1)) {
                pool_cache_put(lwp_cache, l2);
                return EAGAIN;
        }

        /*
         * If vfork(), we want the LWP to run fast and on the same CPU
         * as its parent, so that it can reuse the VM context and cache
         * footprint on the local CPU.
         */
        l2->l_boostpri = ((flags & LWP_VFORK) ? PRI_KERNEL : PRI_USER);
        l2->l_priority = l1->l_priority;
        l2->l_inheritedprio = -1;
        l2->l_protectprio = -1;
        l2->l_auxprio = -1;
        l2->l_flag = 0;
        l2->l_pflag = LP_MPSAFE;
        TAILQ_INIT(&l2->l_ld_locks);
        l2->l_psrefs = 0;
        kmsan_lwp_alloc(l2);

        /*
         * For vfork, borrow parent's lwpctl context if it exists.
         * This also causes us to return via lwp_userret.
         */
        if (flags & LWP_VFORK && l1->l_lwpctl) {
                l2->l_lwpctl = l1->l_lwpctl;
                l2->l_flag |= LW_LWPCTL;
        }

        /*
         * If not the first LWP in the process, grab a reference to the
         * descriptor table.
         */
        l2->l_fd = p2->p_fd;
        if (p2->p_nlwps != 0) {
                KASSERT(l1->l_proc == p2);
                fd_hold(l2);
        } else {
                KASSERT(l1->l_proc != p2);
        }

        if (p2->p_flag & PK_SYSTEM) {
                /* Mark it as a system LWP. */
                l2->l_flag |= LW_SYSTEM;
        }

        kdtrace_thread_ctor(NULL, l2);
        lwp_initspecific(l2);
        sched_lwp_fork(l1, l2);
        callout_init(&l2->l_timeout_ch, CALLOUT_MPSAFE);
        callout_setfunc(&l2->l_timeout_ch, sleepq_timeout, l2);
        cv_init(&l2->l_sigcv, "sigwait");
        cv_init(&l2->l_waitcv, "vfork");
        l2->l_syncobj = &sched_syncobj;
        PSREF_DEBUG_INIT_LWP(l2);

        if (rnewlwpp != NULL)
                *rnewlwpp = l2;

        /*
         * PCU state needs to be saved before calling uvm_lwp_fork() so that
         * the MD cpu_lwp_fork() can copy the saved state to the new LWP.
         */
        pcu_save_all(l1);
#if PCU_UNIT_COUNT > 0
        l2->l_pcu_valid = l1->l_pcu_valid;
#endif

        uvm_lwp_setuarea(l2, uaddr);
        uvm_lwp_fork(l1, l2, stack, stacksize, func, (arg != NULL) ? arg : l2);

        mutex_enter(p2->p_lock);
        l2->l_cred = kauth_cred_hold(p2->p_cred);
        if ((flags & LWP_DETACHED) != 0) {
                l2->l_prflag = LPR_DETACHED;
                p2->p_ndlwps++;
        } else
                l2->l_prflag = 0;

        if (l1->l_proc == p2) {
                /*
                 * These flags are set while p_lock is held.  Copy with
                 * p_lock held too, so the LWP doesn't sneak into the
                 * process without them being set.
                 */
                l2->l_flag |= (l1->l_flag & (LW_WEXIT | LW_WREBOOT | LW_WCORE));
        } else {
                /* fork(): pending core/exit doesn't apply to child. */
                l2->l_flag |= (l1->l_flag & LW_WREBOOT);
        }

        l2->l_sigstk = *sigstk;
        l2->l_sigmask = *sigmask;
        TAILQ_INIT(&l2->l_sigpend.sp_info);
        sigemptyset(&l2->l_sigpend.sp_set);
        LIST_INSERT_HEAD(&p2->p_lwps, l2, l_sibling);
        p2->p_nlwps++;
        p2->p_nrlwps++;

        KASSERT(l2->l_affinity == NULL);

        /* Inherit the affinity mask. */
        if (l1->l_affinity) {
                /*
                 * Note that we hold the state lock while inheriting
                 * the affinity to avoid race with sched_setaffinity().
                 */
                lwp_lock(l1);
                if (l1->l_affinity) {
                        kcpuset_use(l1->l_affinity);
                        l2->l_affinity = l1->l_affinity;
                }
                lwp_unlock(l1);
        }

        /* Ensure a trip through lwp_userret() if needed. */
        if ((l2->l_flag & LW_USERRET) != 0) {
                lwp_need_userret(l2);
        }

        /* This marks the end of the "must be atomic" section. */
        mutex_exit(p2->p_lock);

        SDT_PROBE(proc, kernel, , lwp__create, l2, 0, 0, 0, 0);

        mutex_enter(&proc_lock);
        LIST_INSERT_HEAD(&alllwp, l2, l_list);
        /* Inherit a processor-set */
        l2->l_psid = l1->l_psid;
        mutex_exit(&proc_lock);

        SYSCALL_TIME_LWP_INIT(l2);

        if (p2->p_emul->e_lwp_fork)
                (*p2->p_emul->e_lwp_fork)(l1, l2);

        return (0);
}

/*
 * Set a new LWP running.  If the process is stopping, then the LWP is
 * created stopped.
 */
void
lwp_start(lwp_t *l, int flags)
{
        proc_t *p = l->l_proc;

        mutex_enter(p->p_lock);
        lwp_lock(l);
        KASSERT(l->l_stat == LSIDL);
        if ((flags & LWP_SUSPENDED) != 0) {
                /* It'll suspend itself in lwp_userret(). */
                l->l_flag |= LW_WSUSPEND;
                lwp_need_userret(l);
        }
        if (p->p_stat == SSTOP || (p->p_sflag & PS_STOPPING) != 0) {
                KASSERT(l->l_wchan == NULL);
                l->l_stat = LSSTOP;
                p->p_nrlwps--;
                lwp_unlock(l);
        } else {
                setrunnable(l);
                /* LWP now unlocked */
        }
        mutex_exit(p->p_lock);
}
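
/*
 * A rough sketch of how lwp_create() and lwp_start() fit together for an
 * in-kernel caller (illustrative only; the uarea is assumed to have been
 * allocated elsewhere, the sigmask/sigstk arguments shown are just one
 * plausible choice, and error handling is omitted):
 *
 *        error = lwp_create(curlwp, p, uaddr, 0, NULL, 0, func, arg,
 *            &l2, SCHED_OTHER, &curlwp->l_sigmask, &curlwp->l_sigstk);
 *        if (error == 0)
 *                lwp_start(l2, 0);
 *
 * The new LWP is left in LSIDL by lwp_create(), and lwp_start() then makes
 * it runnable (or stopped, if the process is stopping).
 */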

/*
 * Called by MD code when a new LWP begins execution.  Must be called
 * with the previous LWP locked (so at splsched), or if there is no
 * previous LWP, at splsched.
 */
void
lwp_startup(struct lwp *prev, struct lwp *new_lwp)
{
        kmutex_t *lock;

        KASSERTMSG(new_lwp == curlwp, "l %p curlwp %p prevlwp %p", new_lwp, curlwp, prev);
        KASSERT(kpreempt_disabled());
        KASSERT(prev != NULL);
        KASSERT((prev->l_pflag & LP_RUNNING) != 0);
        KASSERT(curcpu()->ci_mtx_count == -2);

        /*
         * Immediately mark the previous LWP as no longer running and
         * unlock (to keep lock wait times as short as possible).  If a
         * zombie, don't touch after clearing LP_RUNNING as it could be
         * reaped by another CPU.  Use atomic_store_release to ensure
         * this -- matches atomic_load_acquire in lwp_free.
         */
        lock = prev->l_mutex;
        if (__predict_false(prev->l_stat == LSZOMB)) {
                atomic_store_release(&prev->l_pflag,
                    prev->l_pflag & ~LP_RUNNING);
        } else {
                prev->l_pflag &= ~LP_RUNNING;
        }
        mutex_spin_exit(lock);

        /* Correct spin mutex count after mi_switch(). */
        curcpu()->ci_mtx_count = 0;

        /* Install new VM context. */
        if (__predict_true(new_lwp->l_proc->p_vmspace)) {
                pmap_activate(new_lwp);
        }

        /* We remain at IPL_SCHED from mi_switch() - reset it. */
        spl0();

        LOCKDEBUG_BARRIER(NULL, 0);
        SDT_PROBE(proc, kernel, , lwp__start, new_lwp, 0, 0, 0, 0);

        /* For kthreads, acquire kernel lock if not MPSAFE. */
        if (__predict_false((new_lwp->l_pflag & LP_MPSAFE) == 0)) {
                KERNEL_LOCK(1, new_lwp);
        }
}

/*
 * Exit an LWP.
 *
 * *** WARNING *** This can be called with (l != curlwp) in error paths.
 */
void
lwp_exit(struct lwp *l)
{
        struct proc *p = l->l_proc;
        struct lwp *l2;
        bool current;

        current = (l == curlwp);

        KASSERT(current || l->l_stat == LSIDL);
        KASSERT(current || l->l_target_cpu == NULL);
        KASSERT(p == curproc);

        SDT_PROBE(proc, kernel, , lwp__exit, l, 0, 0, 0, 0);

        /* Verify that we hold no locks; for DIAGNOSTIC check kernel_lock. */
        LOCKDEBUG_BARRIER(NULL, 0);
        KASSERTMSG(curcpu()->ci_biglock_count == 0, "kernel_lock leaked");

        /*
         * If we are the last live LWP in a process, we need to exit the
         * entire process.  We do so with an exit status of zero, because
         * it's a "controlled" exit, and because that's what Solaris does.
         *
         * We are not quite a zombie yet, but for accounting purposes we
         * must increment the count of zombies here.
         *
         * Note: the last LWP's specificdata will be deleted here.
         */
        mutex_enter(p->p_lock);
        if (p->p_nlwps - p->p_nzlwps == 1) {
                KASSERT(current == true);
                KASSERT(p != &proc0);
                exit1(l, 0, 0);
                /* NOTREACHED */
        }
        p->p_nzlwps++;

        /*
         * Perform any required thread cleanup.  Do this early so
         * anyone wanting to look us up with lwp_getref_lwpid() will
         * fail to find us before we become a zombie.
         *
         * N.B. this will unlock p->p_lock on our behalf.
         */
        lwp_thread_cleanup(l);

        if (p->p_emul->e_lwp_exit)
                (*p->p_emul->e_lwp_exit)(l);

        /* Drop filedesc reference. */
        fd_free();

        /* Release fstrans private data. */
        fstrans_lwp_dtor(l);

        /* Delete the specificdata while it's still safe to sleep. */
        lwp_finispecific(l);

        /*
         * Release our cached credentials.
         */
        kauth_cred_free(l->l_cred);
        callout_destroy(&l->l_timeout_ch);

        /*
         * If traced, report LWP exit event to the debugger.
         *
         * Remove the LWP from the global list.
         * Free its LID from the PID namespace if needed.
         */
        mutex_enter(&proc_lock);

        if ((p->p_slflag & (PSL_TRACED|PSL_TRACELWP_EXIT)) ==
            (PSL_TRACED|PSL_TRACELWP_EXIT)) {
                mutex_enter(p->p_lock);
                if (ISSET(p->p_sflag, PS_WEXIT)) {
                        mutex_exit(p->p_lock);
                        /*
                         * We are exiting, bail out without informing parent
                         * about a terminating LWP as it would deadlock.
                         */
                } else {
                        eventswitch(TRAP_LWP, PTRACE_LWP_EXIT, l->l_lid);
                        mutex_enter(&proc_lock);
                }
        }

        LIST_REMOVE(l, l_list);
        mutex_exit(&proc_lock);

        /*
         * Get rid of all references to the LWP that others (e.g. procfs)
         * may have, and mark the LWP as a zombie.  If the LWP is detached,
         * mark it waiting for collection in the proc structure.  Note that
         * before we can do that, we need to free any other dead, detached
         * LWP waiting to meet its maker.
         *
         * All conditions need to be observed under the same hold of
         * p_lock, because if the lock is dropped any of them can change.
         */
        mutex_enter(p->p_lock);
        for (;;) {
                if (lwp_drainrefs(l))
                        continue;
                if ((l->l_prflag & LPR_DETACHED) != 0) {
                        if ((l2 = p->p_zomblwp) != NULL) {
                                p->p_zomblwp = NULL;
                                lwp_free(l2, false, false);
                                /* proc now unlocked */
                                mutex_enter(p->p_lock);
                                continue;
                        }
                        p->p_zomblwp = l;
                }
                break;
        }

        /*
         * If we find a pending signal for the process and we have been
         * asked to check for signals, then we lose: arrange to have
         * all other LWPs in the process check for signals.
         */
        if ((l->l_flag & LW_PENDSIG) != 0 &&
            firstsig(&p->p_sigpend.sp_set) != 0) {
                LIST_FOREACH(l2, &p->p_lwps, l_sibling) {
                        lwp_lock(l2);
                        signotify(l2);
                        lwp_unlock(l2);
                }
        }

        /*
         * Release any PCU resources before becoming a zombie.
         */
        pcu_discard_all(l);

        lwp_lock(l);
        l->l_stat = LSZOMB;
        if (l->l_name != NULL) {
                strcpy(l->l_name, "(zombie)");
        }
        lwp_unlock(l);
        p->p_nrlwps--;
        if (l->l_lwpctl != NULL)
                l->l_lwpctl->lc_curcpu = LWPCTL_CPU_EXITED;
        mutex_exit(p->p_lock);
        cv_broadcast(&p->p_lwpcv);

        /*
         * We can no longer block.  At this point, lwp_free() may already
         * be gunning for us.  On a multi-CPU system, we may be off p_lwps.
         *
         * Free MD LWP resources.
         */
        cpu_lwp_free(l, 0);

        if (current) {
                /* Switch away into oblivion. */
                lwp_lock(l);
                spc_lock(l->l_cpu);
                mi_switch(l);
                panic("lwp_exit");
        }
}

/*
 * Free a dead LWP's remaining resources.
 *
 * XXXLWP limits.
 */
void
lwp_free(struct lwp *l, bool recycle, bool last)
{
        struct proc *p = l->l_proc;
        struct rusage *ru;
        ksiginfoq_t kq;

        KASSERT(l != curlwp);
        KASSERT(last || mutex_owned(p->p_lock));

        /*
         * We use the process credentials instead of the lwp credentials here
         * because the lwp credentials may be cached (just after a setuid
         * call) and we don't want to pay for syncing, since the lwp is going
         * away anyway.
         */
        if (p != &proc0 && p->p_nlwps != 1)
                (void)chglwpcnt(kauth_cred_getuid(p->p_cred), -1);

        /*
         * In the unlikely event that the LWP is still on the CPU,
         * then spin until it has switched away.
         *
         * atomic_load_acquire matches atomic_store_release in
         * lwp_startup and mi_switch.
         */
        while (__predict_false((atomic_load_acquire(&l->l_pflag) & LP_RUNNING)
            != 0)) {
                SPINLOCK_BACKOFF_HOOK;
        }

        /*
         * Now that the LWP is known to be off the CPU, reset its state back
         * to LSIDL, which defeats anything that might have gotten a hold on
         * the LWP via pid_table before the ID was freed.  It's important
         * to do this with both the LWP locked and p_lock held.
         *
         * Also reset the CPU and lock pointer back to curcpu(), since the
         * LWP will in all likelihood be cached with the current CPU in
         * lwp_cache when we free it and later allocated from there again
         * (avoid incidental lock contention).
         */
        lwp_lock(l);
        l->l_stat = LSIDL;
        l->l_cpu = curcpu();
        lwp_unlock_to(l, l->l_cpu->ci_schedstate.spc_lwplock);

        /*
         * If this was not the last LWP in the process, then adjust counters
         * and unlock.  This is done differently for the last LWP in exit1().
         */
        if (!last) {
                /*
                 * Add the LWP's run time to the process' base value.
                 * This needs to co-incide with coming off p_lwps.
                 */
                bintime_add(&p->p_rtime, &l->l_rtime);
                p->p_pctcpu += l->l_pctcpu;
                ru = &p->p_stats->p_ru;
                ruadd(ru, &l->l_ru);
                LIST_REMOVE(l, l_sibling);
                p->p_nlwps--;
                p->p_nzlwps--;
                if ((l->l_prflag & LPR_DETACHED) != 0)
                        p->p_ndlwps--;
                mutex_exit(p->p_lock);

                /*
                 * Have any LWPs sleeping in lwp_wait() recheck for
                 * deadlock.
                 */
                cv_broadcast(&p->p_lwpcv);

                /* Free the LWP ID. */
                mutex_enter(&proc_lock);
                proc_free_lwpid(p, l->l_lid);
                mutex_exit(&proc_lock);
        }

        /*
         * Destroy the LWP's remaining signal information.
         */
        ksiginfo_queue_init(&kq);
        sigclear(&l->l_sigpend, NULL, &kq);
        ksiginfo_queue_drain(&kq);
        cv_destroy(&l->l_sigcv);
        cv_destroy(&l->l_waitcv);

        /*
         * Free lwpctl structure and affinity.
         */
        if (l->l_lwpctl) {
                lwp_ctl_free(l);
        }
        if (l->l_affinity) {
                kcpuset_unuse(l->l_affinity, NULL);
                l->l_affinity = NULL;
        }

        /*
         * Free remaining data structures and the LWP itself unless the
         * caller wants to recycle.
         */
        if (l->l_name != NULL)
                kmem_free(l->l_name, MAXCOMLEN);

        kmsan_lwp_free(l);
        kcov_lwp_free(l);
        cpu_lwp_free2(l);
        uvm_lwp_exit(l);

        KASSERT(SLIST_EMPTY(&l->l_pi_lenders));
        KASSERT(l->l_inheritedprio == -1);
        KASSERT(l->l_blcnt == 0);
        kdtrace_thread_dtor(NULL, l);
        if (!recycle)
                pool_cache_put(lwp_cache, l);
}
1368e0d8d366Sthorpej
1369e0d8d366Sthorpej /*
13705c71a4d4Srmind * Migrate the LWP to another CPU. Unlocks the LWP.
13715c71a4d4Srmind */
13725c71a4d4Srmind void
137329170d38Srmind lwp_migrate(lwp_t *l, struct cpu_info *tci)
13745c71a4d4Srmind {
137529170d38Srmind struct schedstate_percpu *tspc;
137630dfdb28Srmind int lstat = l->l_stat;
137730dfdb28Srmind
13785c71a4d4Srmind KASSERT(lwp_locked(l, NULL));
137929170d38Srmind KASSERT(tci != NULL);
13805c71a4d4Srmind
138130dfdb28Srmind /* If LWP is still on the CPU, it must be handled like LSONPROC */
138282002773Sad if ((l->l_pflag & LP_RUNNING) != 0) {
138330dfdb28Srmind lstat = LSONPROC;
138430dfdb28Srmind }
138530dfdb28Srmind
138629170d38Srmind /*
138729170d38Srmind * The destination CPU may have changed while a previous migration
138829170d38Srmind * was still in progress.
138929170d38Srmind */
139030dfdb28Srmind if (l->l_target_cpu != NULL) {
139129170d38Srmind l->l_target_cpu = tci;
13925c71a4d4Srmind lwp_unlock(l);
13935c71a4d4Srmind return;
13945c71a4d4Srmind }
13955c71a4d4Srmind
139629170d38Srmind /* Nothing to do if trying to migrate to the same CPU */
139729170d38Srmind if (l->l_cpu == tci) {
139829170d38Srmind lwp_unlock(l);
139929170d38Srmind return;
140029170d38Srmind }
140129170d38Srmind
140229170d38Srmind KASSERT(l->l_target_cpu == NULL);
140329170d38Srmind tspc = &tci->ci_schedstate;
140430dfdb28Srmind switch (lstat) {
14055c71a4d4Srmind case LSRUN:
140629170d38Srmind l->l_target_cpu = tci;
140740cf6f36Srmind break;
14085c71a4d4Srmind case LSSLEEP:
140929170d38Srmind l->l_cpu = tci;
14105c71a4d4Srmind break;
141111ba4e18Sad case LSIDL:
14125c71a4d4Srmind case LSSTOP:
14135c71a4d4Srmind case LSSUSPENDED:
141429170d38Srmind l->l_cpu = tci;
141529170d38Srmind if (l->l_wchan == NULL) {
141629170d38Srmind lwp_unlock_to(l, tspc->spc_lwplock);
141729170d38Srmind return;
14185c71a4d4Srmind }
141929170d38Srmind break;
14205c71a4d4Srmind case LSONPROC:
142129170d38Srmind l->l_target_cpu = tci;
142229170d38Srmind spc_lock(l->l_cpu);
142311ba4e18Sad sched_resched_cpu(l->l_cpu, PRI_USER_RT, true);
142411ba4e18Sad /* spc now unlocked */
14255c71a4d4Srmind break;
14265c71a4d4Srmind }
14275c71a4d4Srmind lwp_unlock(l);
14285c71a4d4Srmind }
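
/*
 * Illustrative sketch only (not part of the build): how a caller might
 * push an LWP towards another CPU with lwp_migrate() above.  The name
 * example_migrate is hypothetical; the contract shown (LWP locked on
 * entry, unlocked by lwp_migrate()) is the one asserted above.
 */
#if 0
static void
example_migrate(lwp_t *l, struct cpu_info *tci)
{

	lwp_lock(l);		/* lwp_migrate() requires the LWP locked */
	lwp_migrate(l, tci);	/* migrates or records the target; unlocks l */
}
#endif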
14295c71a4d4Srmind
143059150873Sthorpej #define lwp_find_exclude(l) \
143159150873Sthorpej ((l)->l_stat == LSIDL || (l)->l_stat == LSZOMB)
143259150873Sthorpej
14335c71a4d4Srmind /*
14349850c055Srmind * Find the LWP in the process. Arguments may be zero; in that case,
14359850c055Srmind * the calling process and the first LWP in the list will be used.
1436284c2b9aSad * On success, returns with the proc locked.
143759150873Sthorpej *
143859150873Sthorpej * => pid == 0 -> look in curproc.
143959150873Sthorpej * => pid == -1 -> match any proc.
144059150873Sthorpej * => otherwise look up the proc.
144159150873Sthorpej *
144259150873Sthorpej * => lid == 0 -> first LWP in the proc
144359150873Sthorpej * => otherwise specific LWP
14445c71a4d4Srmind */
14455c71a4d4Srmind struct lwp *
14465c71a4d4Srmind lwp_find2(pid_t pid, lwpid_t lid)
14475c71a4d4Srmind {
14485c71a4d4Srmind proc_t *p;
14495c71a4d4Srmind lwp_t *l;
14505c71a4d4Srmind
145159150873Sthorpej /* First LWP of specified proc. */
145259150873Sthorpej if (lid == 0) {
145359150873Sthorpej switch (pid) {
145459150873Sthorpej case -1:
145559150873Sthorpej /* No lookup keys. */
145659150873Sthorpej return NULL;
145759150873Sthorpej case 0:
145859150873Sthorpej p = curproc;
145959150873Sthorpej mutex_enter(p->p_lock);
146059150873Sthorpej break;
146159150873Sthorpej default:
14620eaaa024Sad mutex_enter(&proc_lock);
14633c507045Srmind p = proc_find(pid);
146459150873Sthorpej if (__predict_false(p == NULL)) {
14650eaaa024Sad mutex_exit(&proc_lock);
14663c507045Srmind return NULL;
14679850c055Srmind }
14683c507045Srmind mutex_enter(p->p_lock);
14690eaaa024Sad mutex_exit(&proc_lock);
147059150873Sthorpej break;
14713c507045Srmind }
147259150873Sthorpej LIST_FOREACH(l, &p->p_lwps, l_sibling) {
147359150873Sthorpej if (__predict_true(!lwp_find_exclude(l)))
147459150873Sthorpej break;
14753c507045Srmind }
147659150873Sthorpej goto out;
147759150873Sthorpej }
147859150873Sthorpej
147959150873Sthorpej l = proc_find_lwp_acquire_proc(lid, &p);
148059150873Sthorpej if (l == NULL)
148159150873Sthorpej return NULL;
148259150873Sthorpej KASSERT(p != NULL);
148359150873Sthorpej KASSERT(mutex_owned(p->p_lock));
148459150873Sthorpej
148559150873Sthorpej if (__predict_false(lwp_find_exclude(l))) {
148659150873Sthorpej l = NULL;
148759150873Sthorpej goto out;
148859150873Sthorpej }
148959150873Sthorpej
149059150873Sthorpej /* Apply proc filter, if applicable. */
149159150873Sthorpej switch (pid) {
149259150873Sthorpej case -1:
149359150873Sthorpej /* Match anything. */
149459150873Sthorpej break;
149559150873Sthorpej case 0:
149659150873Sthorpej if (p != curproc)
149759150873Sthorpej l = NULL;
149859150873Sthorpej break;
149959150873Sthorpej default:
150059150873Sthorpej if (p->p_pid != pid)
150159150873Sthorpej l = NULL;
150259150873Sthorpej break;
150359150873Sthorpej }
150459150873Sthorpej
150559150873Sthorpej out:
150659150873Sthorpej if (__predict_false(l == NULL)) {
1507284c2b9aSad mutex_exit(p->p_lock);
1508284c2b9aSad }
15095c71a4d4Srmind return l;
15105c71a4d4Srmind }
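
/*
 * Illustrative sketch only (not compiled): look up an LWP by ID in any
 * process with lwp_find2() and drop the lock it returns held.  The name
 * example_lwp_exists is hypothetical; the locking rule (p_lock held on
 * success) is the one documented above.
 */
#if 0
static bool
example_lwp_exists(lwpid_t lid)
{
	struct lwp *l;

	l = lwp_find2(-1, lid);		/* pid == -1: match any proc */
	if (l == NULL)
		return false;
	mutex_exit(l->l_proc->p_lock);	/* lwp_find2() returned this held */
	return true;
}
#endif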
15115c71a4d4Srmind
15125c71a4d4Srmind /*
1513ea845191Syamt * Look up a live LWP within the specified process.
1514b07ec3fcSad *
1515d1c42b4fSad * Must be called with p->p_lock held (as it looks at the radix tree,
1516d1c42b4fSad * and also wants to exclude idle and zombie LWPs).
1517b07ec3fcSad */
1518b07ec3fcSad struct lwp *
151933fa5ccbSchs lwp_find(struct proc *p, lwpid_t id)
1520b07ec3fcSad {
1521b07ec3fcSad struct lwp *l;
1522b07ec3fcSad
1523284c2b9aSad KASSERT(mutex_owned(p->p_lock));
1524b07ec3fcSad
152515689570Sthorpej l = proc_find_lwp(p, id);
1526d1c42b4fSad KASSERT(l == NULL || l->l_lid == id);
1527b07ec3fcSad
1528b07ec3fcSad /*
1529b07ec3fcSad * No need to lock - all of these conditions will
1530b07ec3fcSad * be visible with the process level mutex held.
1531b07ec3fcSad */
153259150873Sthorpej if (__predict_false(l != NULL && lwp_find_exclude(l)))
1533b07ec3fcSad l = NULL;
1534b07ec3fcSad
1535b07ec3fcSad return l;
1536b07ec3fcSad }
1537b07ec3fcSad
1538b07ec3fcSad /*
1539b07ec3fcSad * Verify that an LWP is locked, and optionally verify that the lock matches
1540b07ec3fcSad * one we specify.
1541b07ec3fcSad */
1542b07ec3fcSad int
1543b07ec3fcSad lwp_locked(struct lwp *l, kmutex_t *mtx)
1544b07ec3fcSad {
1545b07ec3fcSad kmutex_t *cur = l->l_mutex;
1546b07ec3fcSad
1547b07ec3fcSad return mutex_owned(cur) && (mtx == cur || mtx == NULL);
1548b07ec3fcSad }
1549b07ec3fcSad
1550b07ec3fcSad /*
1551b07ec3fcSad * Lend a new mutex to an LWP. The old mutex must be held.
1552b07ec3fcSad */
15530e70dcbeSad kmutex_t *
1554a35d1a8cSmatt lwp_setlock(struct lwp *l, kmutex_t *mtx)
1555b07ec3fcSad {
15560e70dcbeSad kmutex_t *oldmtx = l->l_mutex;
1557b07ec3fcSad
15580e70dcbeSad KASSERT(mutex_owned(oldmtx));
1559b07ec3fcSad
156076e07a94Sriastradh atomic_store_release(&l->l_mutex, mtx);
15610e70dcbeSad return oldmtx;
1562b07ec3fcSad }
1563b07ec3fcSad
1564b07ec3fcSad /*
1565b07ec3fcSad * Lend a new mutex to an LWP, and release the old mutex. The old mutex
1566b07ec3fcSad * must be held.
1567b07ec3fcSad */
1568b07ec3fcSad void
1569a35d1a8cSmatt lwp_unlock_to(struct lwp *l, kmutex_t *mtx)
1570b07ec3fcSad {
1571b07ec3fcSad kmutex_t *old;
1572b07ec3fcSad
157311a35aedSrmind KASSERT(lwp_locked(l, NULL));
1574b07ec3fcSad
1575b07ec3fcSad old = l->l_mutex;
157676e07a94Sriastradh atomic_store_release(&l->l_mutex, mtx);
1577b07ec3fcSad mutex_spin_exit(old);
1578b07ec3fcSad }
1579b07ec3fcSad
1580e781af39Syamt int
1581e781af39Syamt lwp_trylock(struct lwp *l)
1582e781af39Syamt {
1583e781af39Syamt kmutex_t *old;
1584e781af39Syamt
1585e781af39Syamt for (;;) {
158676e07a94Sriastradh if (!mutex_tryenter(old = atomic_load_consume(&l->l_mutex)))
1587e781af39Syamt return 0;
158876e07a94Sriastradh if (__predict_true(atomic_load_relaxed(&l->l_mutex) == old))
1589e781af39Syamt return 1;
1590e781af39Syamt mutex_spin_exit(old);
1591e781af39Syamt }
1592e781af39Syamt }
1593e781af39Syamt
159440cf6f36Srmind void
15950e70dcbeSad lwp_unsleep(lwp_t *l, bool unlock)
1596c42a4d14Sad {
1597c42a4d14Sad
1598c42a4d14Sad KASSERT(mutex_owned(l->l_mutex));
15990e70dcbeSad (*l->l_syncobj->sobj_unsleep)(l, unlock);
1600c42a4d14Sad }
1601c42a4d14Sad
1602b07ec3fcSad /*
16036ed72b5fSad * Lock an LWP.
16046ed72b5fSad */
16056ed72b5fSad void
16066ed72b5fSad lwp_lock(lwp_t *l)
16076ed72b5fSad {
16086ed72b5fSad kmutex_t *old = atomic_load_consume(&l->l_mutex);
16096ed72b5fSad
16106ed72b5fSad /*
16116ed72b5fSad * Note: mutex_spin_enter() will have posted a read barrier.
16126ed72b5fSad * Re-test l->l_mutex. If it has changed, we need to try again.
16136ed72b5fSad */
16146ed72b5fSad mutex_spin_enter(old);
16156ed72b5fSad while (__predict_false(atomic_load_relaxed(&l->l_mutex) != old)) {
16166ed72b5fSad mutex_spin_exit(old);
16176ed72b5fSad old = atomic_load_consume(&l->l_mutex);
16186ed72b5fSad mutex_spin_enter(old);
16196ed72b5fSad }
16206ed72b5fSad }
16216ed72b5fSad
16226ed72b5fSad /*
16236ed72b5fSad * Unlock an LWP.
16246ed72b5fSad */
16256ed72b5fSad void
16266ed72b5fSad lwp_unlock(lwp_t *l)
16276ed72b5fSad {
16286ed72b5fSad
16296ed72b5fSad mutex_spin_exit(l->l_mutex);
16306ed72b5fSad }
16316ed72b5fSad
16326ed72b5fSad void
16336ed72b5fSad lwp_changepri(lwp_t *l, pri_t pri)
16346ed72b5fSad {
16356ed72b5fSad
16366ed72b5fSad KASSERT(mutex_owned(l->l_mutex));
16376ed72b5fSad
16386ed72b5fSad if (l->l_priority == pri)
16396ed72b5fSad return;
16406ed72b5fSad
16416ed72b5fSad (*l->l_syncobj->sobj_changepri)(l, pri);
16426ed72b5fSad KASSERT(l->l_priority == pri);
16436ed72b5fSad }
16446ed72b5fSad
16456ed72b5fSad void
16466ed72b5fSad lwp_lendpri(lwp_t *l, pri_t pri)
16476ed72b5fSad {
16486ed72b5fSad KASSERT(mutex_owned(l->l_mutex));
16496ed72b5fSad
16506ed72b5fSad (*l->l_syncobj->sobj_lendpri)(l, pri);
16516ed72b5fSad KASSERT(l->l_inheritedprio == pri);
16526ed72b5fSad }
16536ed72b5fSad
16546ed72b5fSad pri_t
16556ed72b5fSad lwp_eprio(lwp_t *l)
16566ed72b5fSad {
16576ed72b5fSad pri_t pri = l->l_priority;
16586ed72b5fSad
16596ed72b5fSad KASSERT(mutex_owned(l->l_mutex));
16606ed72b5fSad
16616ed72b5fSad /*
16626ed72b5fSad * Timeshared/user LWPs get a temporary priority boost for blocking
16636ed72b5fSad * in kernel. This is key to good interactive response on a loaded
16646ed72b5fSad * system: without it, things will seem very sluggish to the user.
16656ed72b5fSad *
16666ed72b5fSad * The function of the boost is to get the LWP onto a CPU and
16676ed72b5fSad * running quickly. Once that happens the LWP loses the priority
16686ed72b5fSad * boost and could be preempted very quickly by another LWP but that
1669*bcfabd50Sandvar * won't happen often enough to be an annoyance.
16706ed72b5fSad */
1671cbc1d2c4Sad if (pri <= MAXPRI_USER && l->l_boostpri > MAXPRI_USER)
1672cbc1d2c4Sad pri = (pri >> 1) + l->l_boostpri;
16736ed72b5fSad
16746ed72b5fSad return MAX(l->l_auxprio, pri);
16756ed72b5fSad }
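
/*
 * Illustrative arithmetic for lwp_eprio() above (hedged: the concrete
 * numbers depend on the platform's priority ranges): for a timeshared
 * LWP with l_priority = p <= MAXPRI_USER and a kernel-range boost
 * l_boostpri = b > MAXPRI_USER, the effective priority works out to
 * MAX(l_auxprio, p / 2 + b), so the LWP briefly competes above the
 * user range until the boost is lost.
 */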
16766ed72b5fSad
16776ed72b5fSad /*
1678934634a1Spavel * Handle exceptions for mi_userret(). Called if a member of LW_USERRET is
16796ed72b5fSad * set or a preemption is required.
1680b07ec3fcSad */
1681b07ec3fcSad void
1682b07ec3fcSad lwp_userret(struct lwp *l)
1683b07ec3fcSad {
1684b07ec3fcSad struct proc *p;
16856ed72b5fSad int sig, f;
1686b07ec3fcSad
168729170d38Srmind KASSERT(l == curlwp);
168829170d38Srmind KASSERT(l->l_stat == LSONPROC);
1689b07ec3fcSad p = l->l_proc;
1690b07ec3fcSad
16916ed72b5fSad for (;;) {
1692b07ec3fcSad /*
16936ed72b5fSad * This is the main location where user preemptions are
16946ed72b5fSad * processed.
1695b07ec3fcSad */
16966ed72b5fSad preempt_point();
16976ed72b5fSad
16986ed72b5fSad /*
16996ed72b5fSad * It is safe to do this unlocked and without raised SPL,
17006ed72b5fSad * since whenever a flag of interest is added to l_flag the
17016ed72b5fSad * LWP will take an AST and come down this path again. If a
17026ed72b5fSad * remote CPU posts the AST, it will be done with an IPI
17036ed72b5fSad * (strongly synchronising).
17046ed72b5fSad */
17056ed72b5fSad if ((f = atomic_load_relaxed(&l->l_flag) & LW_USERRET) == 0) {
17066ed72b5fSad return;
17076ed72b5fSad }
17086ed72b5fSad
1709b07ec3fcSad /*
171068fa5843Sad * Start out with the correct credentials.
171168fa5843Sad */
171268fa5843Sad if ((f & LW_CACHECRED) != 0) {
171368fa5843Sad kauth_cred_t oc = l->l_cred;
171468fa5843Sad mutex_enter(p->p_lock);
171568fa5843Sad l->l_cred = kauth_cred_hold(p->p_cred);
171668fa5843Sad lwp_lock(l);
171768fa5843Sad l->l_flag &= ~LW_CACHECRED;
171868fa5843Sad lwp_unlock(l);
171968fa5843Sad mutex_exit(p->p_lock);
172068fa5843Sad kauth_cred_free(oc);
172168fa5843Sad }
172268fa5843Sad
172368fa5843Sad /*
1724b07ec3fcSad * Process pending signals first, unless the process
1725681b77ebSad * is dumping core or exiting, in which case we will instead
17265c0e3318Srmind * enter the LW_WSUSPEND case below.
1727b07ec3fcSad */
17286ed72b5fSad if ((f & (LW_PENDSIG | LW_WCORE | LW_WEXIT)) == LW_PENDSIG) {
1729284c2b9aSad mutex_enter(p->p_lock);
1730b07ec3fcSad while ((sig = issignal(l)) != 0)
1731b07ec3fcSad postsig(sig);
1732284c2b9aSad mutex_exit(p->p_lock);
17336ed72b5fSad continue;
1734b07ec3fcSad }
1735b07ec3fcSad
1736b07ec3fcSad /*
1737b07ec3fcSad * Core-dump or suspend pending.
1738b07ec3fcSad *
17395ca5a72bSmatt * In case of core dump, suspend ourselves, so that the kernel
17405ca5a72bSmatt * stack and therefore the userland registers saved in the
17415ca5a72bSmatt * trapframe are around for coredump() to write them out.
17425ca5a72bSmatt * We also need to save any PCU resources that we have so that
17435ca5a72bSmatt * they are accessible to coredump(). We issue a wakeup on
17445ca5a72bSmatt * p->p_lwpcv so that sigexit() will write the core file out
17455ca5a72bSmatt * once all other LWPs are suspended.
1746b07ec3fcSad */
17476ed72b5fSad if ((f & LW_WSUSPEND) != 0) {
17485ca5a72bSmatt pcu_save_all(l);
1749284c2b9aSad mutex_enter(p->p_lock);
1750b07ec3fcSad p->p_nrlwps--;
1751b07ec3fcSad lwp_lock(l);
1752b07ec3fcSad l->l_stat = LSSUSPENDED;
17533cef7381Sad lwp_unlock(l);
1754284c2b9aSad mutex_exit(p->p_lock);
17553d1cabfdSad cv_broadcast(&p->p_lwpcv);
17563cef7381Sad lwp_lock(l);
17574477d28dSad spc_lock(l->l_cpu);
1758f0301095Syamt mi_switch(l);
17596ed72b5fSad continue;
1760b07ec3fcSad }
1761b07ec3fcSad
17626ed72b5fSad /*
17636ed72b5fSad * Process is exiting. The core dump and signal cases must
17646ed72b5fSad * be handled first.
17656ed72b5fSad */
17666ed72b5fSad if ((f & LW_WEXIT) != 0) {
1767b07ec3fcSad lwp_exit(l);
1768b07ec3fcSad KASSERT(0);
1769b07ec3fcSad /* NOTREACHED */
1770b07ec3fcSad }
1771e820d9feSpooka
17726ed72b5fSad /*
17736ed72b5fSad * Update lwpctl processor (for vfork child_return).
17746ed72b5fSad */
17756ed72b5fSad if ((f & LW_LWPCTL) != 0) {
1776e820d9feSpooka lwp_lock(l);
1777e820d9feSpooka KASSERT(kpreempt_disabled());
1778e820d9feSpooka l->l_lwpctl->lc_curcpu = (int)cpu_index(l->l_cpu);
1779e820d9feSpooka l->l_lwpctl->lc_pctr++;
1780e820d9feSpooka l->l_flag &= ~LW_LWPCTL;
1781e820d9feSpooka lwp_unlock(l);
17826ed72b5fSad continue;
1783e820d9feSpooka }
1784b07ec3fcSad }
1785b07ec3fcSad }
1786b07ec3fcSad
1787b07ec3fcSad /*
1788b07ec3fcSad * Force an LWP to enter the kernel, to take a trip through lwp_userret().
1789b07ec3fcSad */
1790b07ec3fcSad void
1791b07ec3fcSad lwp_need_userret(struct lwp *l)
1792b07ec3fcSad {
1793e57dd2baSad
1794e57dd2baSad KASSERT(!cpu_intr_p());
1795725adb2aSad KASSERT(lwp_locked(l, NULL) || l->l_stat == LSIDL);
1796b07ec3fcSad
1797b07ec3fcSad /*
1798e57dd2baSad * If the LWP is in any state other than LSONPROC, we know that it
1799e57dd2baSad * is executing in-kernel and will hit userret() on the way out.
1800e57dd2baSad *
1801e57dd2baSad * If the LWP is curlwp, then we know we'll be back out to userspace
1802e57dd2baSad * soon (can't be called from a hardware interrupt here).
1803e57dd2baSad *
1804e57dd2baSad * Otherwise, we can't be sure what the LWP is doing, so first make
1805e57dd2baSad * sure the update to l_flag will be globally visible, and then
1806e57dd2baSad * force the LWP to take a trip through trap() where it will do
1807e57dd2baSad * userret().
1808b07ec3fcSad */
1809e57dd2baSad if (l->l_stat == LSONPROC && l != curlwp) {
1810e2aaefb8Sad membar_producer();
1811b07ec3fcSad cpu_signotify(l);
1812b07ec3fcSad }
1813e57dd2baSad }
1814b07ec3fcSad
1815b07ec3fcSad /*
1816b07ec3fcSad * Add one reference to an LWP. This will prevent the LWP from
1817b07ec3fcSad * exiting, thus keeping the lwp structure and PCB around to inspect.
1818b07ec3fcSad */
1819b07ec3fcSad void
1820b07ec3fcSad lwp_addref(struct lwp *l)
1821b07ec3fcSad {
1822284c2b9aSad KASSERT(mutex_owned(l->l_proc->p_lock));
182359150873Sthorpej KASSERT(l->l_stat != LSZOMB);
182459150873Sthorpej l->l_refcnt++;
1825b07ec3fcSad }
1826b07ec3fcSad
1827b07ec3fcSad /*
1828b07ec3fcSad * Remove one reference to an LWP. If this is the last reference,
1829b07ec3fcSad * then we must finalize the LWP's death.
1830b07ec3fcSad */
1831b07ec3fcSad void
1832b07ec3fcSad lwp_delref(struct lwp *l)
1833b07ec3fcSad {
1834b07ec3fcSad struct proc *p = l->l_proc;
1835b07ec3fcSad
1836284c2b9aSad mutex_enter(p->p_lock);
1837ca843a73Schristos lwp_delref2(l);
1838ca843a73Schristos mutex_exit(p->p_lock);
1839ca843a73Schristos }
1840ca843a73Schristos
1841ca843a73Schristos /*
1842ca843a73Schristos * Remove one reference to an LWP. If this is the last reference,
1843ca843a73Schristos * then we must finalize the LWP's death. The proc mutex is held
1844ca843a73Schristos * on entry.
1845ca843a73Schristos */
1846ca843a73Schristos void
1847ca843a73Schristos lwp_delref2(struct lwp *l)
1848ca843a73Schristos {
1849ca843a73Schristos struct proc *p = l->l_proc;
1850ca843a73Schristos
1851ca843a73Schristos KASSERT(mutex_owned(p->p_lock));
18520a0689eeSad KASSERT(l->l_stat != LSZOMB);
185359150873Sthorpej KASSERT(l->l_refcnt > 0);
18546a317a61Sad
185559150873Sthorpej if (--l->l_refcnt == 0)
1856d18c6ca4Sad cv_broadcast(&p->p_lwpcv);
1857b07ec3fcSad }
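
/*
 * Illustrative sketch only (not compiled): pinning an LWP across an
 * operation that must not race with LWP exit, using lwp_find(),
 * lwp_addref() and lwp_delref() above.  example_pin_and_inspect and
 * inspect_lwp() are hypothetical stand-ins for the caller's own code.
 */
#if 0
static void
example_pin_and_inspect(struct proc *p, lwpid_t lid)
{
	struct lwp *l;

	mutex_enter(p->p_lock);
	l = lwp_find(p, lid);		/* live LWPs only; needs p_lock */
	if (l != NULL)
		lwp_addref(l);		/* keep the lwp structure and PCB around */
	mutex_exit(p->p_lock);

	if (l != NULL) {
		inspect_lwp(l);		/* hypothetical work on the LWP */
		lwp_delref(l);		/* may wake an lwp_drainrefs() waiter */
	}
}
#endif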
1858b07ec3fcSad
1859b07ec3fcSad /*
186098a9cebbSthorpej * Drain all references to the current LWP. Returns true if
186198a9cebbSthorpej * we blocked.
1862b07ec3fcSad */
186398a9cebbSthorpej bool
1864b07ec3fcSad lwp_drainrefs(struct lwp *l)
1865b07ec3fcSad {
1866b07ec3fcSad struct proc *p = l->l_proc;
186798a9cebbSthorpej bool rv = false;
1868b07ec3fcSad
1869284c2b9aSad KASSERT(mutex_owned(p->p_lock));
1870b07ec3fcSad
187198a9cebbSthorpej l->l_prflag |= LPR_DRAINING;
187298a9cebbSthorpej
187359150873Sthorpej while (l->l_refcnt > 0) {
187498a9cebbSthorpej rv = true;
1875284c2b9aSad cv_wait(&p->p_lwpcv, p->p_lock);
18762b79369cSad }
187798a9cebbSthorpej return rv;
187898a9cebbSthorpej }
187904e486d9Sthorpej
188004e486d9Sthorpej /*
18813c323631Sad * Return true if the specified LWP is 'alive'. Only p->p_lock need
18823c323631Sad * be held.
18833c323631Sad */
18843c323631Sad bool
18853c323631Sad lwp_alive(lwp_t *l)
18863c323631Sad {
18873c323631Sad
18883c323631Sad KASSERT(mutex_owned(l->l_proc->p_lock));
18893c323631Sad
18903c323631Sad switch (l->l_stat) {
18913c323631Sad case LSSLEEP:
18923c323631Sad case LSRUN:
18933c323631Sad case LSONPROC:
18943c323631Sad case LSSTOP:
18953c323631Sad case LSSUSPENDED:
18963c323631Sad return true;
18973c323631Sad default:
18983c323631Sad return false;
18993c323631Sad }
19003c323631Sad }
19013c323631Sad
19023c323631Sad /*
19033c323631Sad * Return first live LWP in the process.
19043c323631Sad */
19053c323631Sad lwp_t *
19063c323631Sad lwp_find_first(proc_t *p)
19073c323631Sad {
19083c323631Sad lwp_t *l;
19093c323631Sad
19103c323631Sad KASSERT(mutex_owned(p->p_lock));
19113c323631Sad
19123c323631Sad LIST_FOREACH(l, &p->p_lwps, l_sibling) {
19133c323631Sad if (lwp_alive(l)) {
19143c323631Sad return l;
19153c323631Sad }
19163c323631Sad }
19173c323631Sad
19183c323631Sad return NULL;
19193c323631Sad }
19203c323631Sad
19213c323631Sad /*
1922b668a9a0Sad * Allocate a new lwpctl structure for a user LWP.
1923b668a9a0Sad */
1924b668a9a0Sad int
1925b668a9a0Sad lwp_ctl_alloc(vaddr_t *uaddr)
1926b668a9a0Sad {
1927b668a9a0Sad lcproc_t *lp;
1928b668a9a0Sad u_int bit, i, offset;
1929b668a9a0Sad struct uvm_object *uao;
1930b668a9a0Sad int error;
1931b668a9a0Sad lcpage_t *lcp;
1932b668a9a0Sad proc_t *p;
1933b668a9a0Sad lwp_t *l;
1934b668a9a0Sad
1935b668a9a0Sad l = curlwp;
1936b668a9a0Sad p = l->l_proc;
1937b668a9a0Sad
1938e820d9feSpooka /* don't allow a vforked process to create lwp ctls */
1939e820d9feSpooka if (p->p_lflag & PL_PPWAIT)
1940e820d9feSpooka return EBUSY;
1941e820d9feSpooka
1942e2aaefb8Sad if (l->l_lcpage != NULL) {
1943e2aaefb8Sad lcp = l->l_lcpage;
1944e2aaefb8Sad *uaddr = lcp->lcp_uaddr + (vaddr_t)l->l_lwpctl - lcp->lcp_kaddr;
1945cb925a94Snjoly return 0;
1946e2aaefb8Sad }
1947b668a9a0Sad
1948b668a9a0Sad /* First time around, allocate header structure for the process. */
1949b668a9a0Sad if ((lp = p->p_lwpctl) == NULL) {
1950b668a9a0Sad lp = kmem_alloc(sizeof(*lp), KM_SLEEP);
1951b668a9a0Sad mutex_init(&lp->lp_lock, MUTEX_DEFAULT, IPL_NONE);
1952b668a9a0Sad lp->lp_uao = NULL;
1953b668a9a0Sad TAILQ_INIT(&lp->lp_pages);
1954284c2b9aSad mutex_enter(p->p_lock);
1955b668a9a0Sad if (p->p_lwpctl == NULL) {
1956b668a9a0Sad p->p_lwpctl = lp;
1957284c2b9aSad mutex_exit(p->p_lock);
1958b668a9a0Sad } else {
1959284c2b9aSad mutex_exit(p->p_lock);
1960b668a9a0Sad mutex_destroy(&lp->lp_lock);
1961b668a9a0Sad kmem_free(lp, sizeof(*lp));
1962b668a9a0Sad lp = p->p_lwpctl;
1963b668a9a0Sad }
1964b668a9a0Sad }
1965b668a9a0Sad
1966b668a9a0Sad /*
1967b668a9a0Sad * Set up an anonymous memory region to hold the shared pages.
1968b668a9a0Sad * Map them into the process' address space. The user vmspace
1969b668a9a0Sad * gets the first reference on the UAO.
1970b668a9a0Sad */
1971b668a9a0Sad mutex_enter(&lp->lp_lock);
1972b668a9a0Sad if (lp->lp_uao == NULL) {
1973b668a9a0Sad lp->lp_uao = uao_create(LWPCTL_UAREA_SZ, 0);
1974b668a9a0Sad lp->lp_cur = 0;
1975b668a9a0Sad lp->lp_max = LWPCTL_UAREA_SZ;
1976b668a9a0Sad lp->lp_uva = p->p_emul->e_vm_default_addr(p,
197776713fa8Smartin (vaddr_t)p->p_vmspace->vm_daddr, LWPCTL_UAREA_SZ,
197876713fa8Smartin p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN);
1979b668a9a0Sad error = uvm_map(&p->p_vmspace->vm_map, &lp->lp_uva,
1980b668a9a0Sad LWPCTL_UAREA_SZ, lp->lp_uao, 0, 0, UVM_MAPFLAG(UVM_PROT_RW,
1981b668a9a0Sad UVM_PROT_RW, UVM_INH_NONE, UVM_ADV_NORMAL, 0));
1982b668a9a0Sad if (error != 0) {
1983b668a9a0Sad uao_detach(lp->lp_uao);
1984b668a9a0Sad lp->lp_uao = NULL;
1985b668a9a0Sad mutex_exit(&lp->lp_lock);
1986b668a9a0Sad return error;
1987b668a9a0Sad }
1988b668a9a0Sad }
1989b668a9a0Sad
1990b668a9a0Sad /* Get a free block and allocate for this LWP. */
1991b668a9a0Sad TAILQ_FOREACH(lcp, &lp->lp_pages, lcp_chain) {
1992b668a9a0Sad if (lcp->lcp_nfree != 0)
1993b668a9a0Sad break;
1994b668a9a0Sad }
1995b668a9a0Sad if (lcp == NULL) {
1996b668a9a0Sad /* Nothing available - try to set up a free page. */
1997b668a9a0Sad if (lp->lp_cur == lp->lp_max) {
1998b668a9a0Sad mutex_exit(&lp->lp_lock);
1999b668a9a0Sad return ENOMEM;
2000b668a9a0Sad }
2001b668a9a0Sad lcp = kmem_alloc(LWPCTL_LCPAGE_SZ, KM_SLEEP);
2002fd34ea77Schs
2003b668a9a0Sad /*
2004b668a9a0Sad * Wire the next page down in kernel space. Since this
2005b668a9a0Sad * is a new mapping, we must add a reference.
2006b668a9a0Sad */
2007b668a9a0Sad uao = lp->lp_uao;
2008b668a9a0Sad (*uao->pgops->pgo_reference)(uao);
200925b10dbbSad lcp->lcp_kaddr = vm_map_min(kernel_map);
2010b668a9a0Sad error = uvm_map(kernel_map, &lcp->lcp_kaddr, PAGE_SIZE,
2011b668a9a0Sad uao, lp->lp_cur, PAGE_SIZE,
2012b668a9a0Sad UVM_MAPFLAG(UVM_PROT_RW, UVM_PROT_RW,
2013b668a9a0Sad UVM_INH_NONE, UVM_ADV_RANDOM, 0));
2014b668a9a0Sad if (error != 0) {
2015b668a9a0Sad mutex_exit(&lp->lp_lock);
2016b668a9a0Sad kmem_free(lcp, LWPCTL_LCPAGE_SZ);
2017b668a9a0Sad (*uao->pgops->pgo_detach)(uao);
2018b668a9a0Sad return error;
2019b668a9a0Sad }
20202e8a5beeSyamt error = uvm_map_pageable(kernel_map, lcp->lcp_kaddr,
20212e8a5beeSyamt lcp->lcp_kaddr + PAGE_SIZE, FALSE, 0);
20222e8a5beeSyamt if (error != 0) {
20232e8a5beeSyamt mutex_exit(&lp->lp_lock);
20242e8a5beeSyamt uvm_unmap(kernel_map, lcp->lcp_kaddr,
20252e8a5beeSyamt lcp->lcp_kaddr + PAGE_SIZE);
20262e8a5beeSyamt kmem_free(lcp, LWPCTL_LCPAGE_SZ);
20272e8a5beeSyamt return error;
20282e8a5beeSyamt }
2029b668a9a0Sad /* Prepare the page descriptor and link into the list. */
2030b668a9a0Sad lcp->lcp_uaddr = lp->lp_uva + lp->lp_cur;
2031b668a9a0Sad lp->lp_cur += PAGE_SIZE;
2032b668a9a0Sad lcp->lcp_nfree = LWPCTL_PER_PAGE;
2033b668a9a0Sad lcp->lcp_rotor = 0;
2034b668a9a0Sad memset(lcp->lcp_bitmap, 0xff, LWPCTL_BITMAP_SZ);
2035b668a9a0Sad TAILQ_INSERT_HEAD(&lp->lp_pages, lcp, lcp_chain);
2036b668a9a0Sad }
2037b668a9a0Sad for (i = lcp->lcp_rotor; lcp->lcp_bitmap[i] == 0;) {
2038b668a9a0Sad if (++i >= LWPCTL_BITMAP_ENTRIES)
2039b668a9a0Sad i = 0;
2040b668a9a0Sad }
2041b668a9a0Sad bit = ffs(lcp->lcp_bitmap[i]) - 1;
204285b6812cSkamil lcp->lcp_bitmap[i] ^= (1U << bit);
2043b668a9a0Sad lcp->lcp_rotor = i;
2044b668a9a0Sad lcp->lcp_nfree--;
2045b668a9a0Sad l->l_lcpage = lcp;
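	/* Each 32-bit bitmap word maps 32 lwpctl slots: slot = i * 32 + bit. */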
2046b668a9a0Sad offset = (i << 5) + bit;
2047b668a9a0Sad l->l_lwpctl = (lwpctl_t *)lcp->lcp_kaddr + offset;
2048b668a9a0Sad *uaddr = lcp->lcp_uaddr + offset * sizeof(lwpctl_t);
2049b668a9a0Sad mutex_exit(&lp->lp_lock);
2050b668a9a0Sad
20514c7ba244Sad KPREEMPT_DISABLE(l);
2052c8304d84Sskrll l->l_lwpctl->lc_curcpu = (int)cpu_index(curcpu());
20534c7ba244Sad KPREEMPT_ENABLE(l);
2054b668a9a0Sad
2055b668a9a0Sad return 0;
2056b668a9a0Sad }
2057b668a9a0Sad
2058b668a9a0Sad /*
2059b668a9a0Sad * Free an lwpctl structure back to the per-process list.
2060b668a9a0Sad */
2061b668a9a0Sad void
2062b668a9a0Sad lwp_ctl_free(lwp_t *l)
2063b668a9a0Sad {
2064e820d9feSpooka struct proc *p = l->l_proc;
2065b668a9a0Sad lcproc_t *lp;
2066b668a9a0Sad lcpage_t *lcp;
2067b668a9a0Sad u_int map, offset;
2068b668a9a0Sad
2069e820d9feSpooka /* don't free a lwp context we borrowed for vfork */
2070e820d9feSpooka if (p->p_lflag & PL_PPWAIT) {
2071e820d9feSpooka l->l_lwpctl = NULL;
2072e820d9feSpooka return;
2073e820d9feSpooka }
2074e820d9feSpooka
2075e820d9feSpooka lp = p->p_lwpctl;
2076b668a9a0Sad KASSERT(lp != NULL);
2077b668a9a0Sad
2078b668a9a0Sad lcp = l->l_lcpage;
2079b668a9a0Sad offset = (u_int)((lwpctl_t *)l->l_lwpctl - (lwpctl_t *)lcp->lcp_kaddr);
2080b668a9a0Sad KASSERT(offset < LWPCTL_PER_PAGE);
2081b668a9a0Sad
2082b668a9a0Sad mutex_enter(&lp->lp_lock);
2083b668a9a0Sad lcp->lcp_nfree++;
2084b668a9a0Sad map = offset >> 5;
20851d52842dSkamil lcp->lcp_bitmap[map] |= (1U << (offset & 31));
2086b668a9a0Sad if (lcp->lcp_bitmap[lcp->lcp_rotor] == 0)
2087b668a9a0Sad lcp->lcp_rotor = map;
2088b668a9a0Sad if (TAILQ_FIRST(&lp->lp_pages)->lcp_nfree == 0) {
2089b668a9a0Sad TAILQ_REMOVE(&lp->lp_pages, lcp, lcp_chain);
2090b668a9a0Sad TAILQ_INSERT_HEAD(&lp->lp_pages, lcp, lcp_chain);
2091b668a9a0Sad }
2092b668a9a0Sad mutex_exit(&lp->lp_lock);
2093b668a9a0Sad }
2094b668a9a0Sad
2095b668a9a0Sad /*
2096b668a9a0Sad * Process is exiting; tear down lwpctl state. This can only be safely
2097b668a9a0Sad * called by the last LWP in the process.
2098b668a9a0Sad */
2099b668a9a0Sad void
2100b668a9a0Sad lwp_ctl_exit(void)
2101b668a9a0Sad {
2102b668a9a0Sad lcpage_t *lcp, *next;
2103b668a9a0Sad lcproc_t *lp;
2104b668a9a0Sad proc_t *p;
2105b668a9a0Sad lwp_t *l;
2106b668a9a0Sad
2107b668a9a0Sad l = curlwp;
2108b668a9a0Sad l->l_lwpctl = NULL;
210932b8f98eSad l->l_lcpage = NULL;
2110b668a9a0Sad p = l->l_proc;
2111b668a9a0Sad lp = p->p_lwpctl;
2112b668a9a0Sad
2113b668a9a0Sad KASSERT(lp != NULL);
2114b668a9a0Sad KASSERT(p->p_nlwps == 1);
2115b668a9a0Sad
2116b668a9a0Sad for (lcp = TAILQ_FIRST(&lp->lp_pages); lcp != NULL; lcp = next) {
2117b668a9a0Sad next = TAILQ_NEXT(lcp, lcp_chain);
2118b668a9a0Sad uvm_unmap(kernel_map, lcp->lcp_kaddr,
2119b668a9a0Sad lcp->lcp_kaddr + PAGE_SIZE);
2120b668a9a0Sad kmem_free(lcp, LWPCTL_LCPAGE_SZ);
2121b668a9a0Sad }
2122b668a9a0Sad
2123b668a9a0Sad if (lp->lp_uao != NULL) {
2124b668a9a0Sad uvm_unmap(&p->p_vmspace->vm_map, lp->lp_uva,
2125b668a9a0Sad lp->lp_uva + LWPCTL_UAREA_SZ);
2126b668a9a0Sad }
2127b668a9a0Sad
2128b668a9a0Sad mutex_destroy(&lp->lp_lock);
2129b668a9a0Sad kmem_free(lp, sizeof(*lp));
2130b668a9a0Sad p->p_lwpctl = NULL;
2131b668a9a0Sad }
21320c382013Syamt
2133f0545a5eSad /*
2134f0545a5eSad * Return the current LWP's "preemption counter". Used to detect
2135f0545a5eSad * preemption across operations that can tolerate preemption without
2136f0545a5eSad * crashing, but which may generate incorrect results if preempted.
2137f103f77aSriastradh *
2138f103f77aSriastradh * We do arithmetic in unsigned long to avoid undefined behaviour in
2139f103f77aSriastradh * the event of arithmetic overflow on LP32, and issue __insn_barrier()
2140f103f77aSriastradh * on both sides so this can safely be used to detect changes to the
2141f103f77aSriastradh * preemption counter in loops around other memory accesses even in the
2142f103f77aSriastradh * event of whole-program optimization (e.g., gcc -flto).
2143f0545a5eSad */
2144a355028fSad long
2145f0545a5eSad lwp_pctr(void)
2146f0545a5eSad {
2147f103f77aSriastradh unsigned long pctr;
2148f0545a5eSad
2149f103f77aSriastradh __insn_barrier();
2150f103f77aSriastradh pctr = curlwp->l_ru.ru_nvcsw;
2151f103f77aSriastradh pctr += curlwp->l_ru.ru_nivcsw;
2152f103f77aSriastradh __insn_barrier();
2153f103f77aSriastradh return pctr;
2154f0545a5eSad }
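
/*
 * Illustrative sketch only (not compiled): detecting preemption around
 * a computation with lwp_pctr(), as described above.  example_read and
 * read_some_percpu_value() are hypothetical stand-ins for the caller's
 * own code.
 */
#if 0
static unsigned int
example_read(void)
{
	unsigned int v;
	long pctr;

	do {
		pctr = lwp_pctr();
		v = read_some_percpu_value();	/* hypothetical */
	} while (pctr != lwp_pctr());		/* preempted in between: retry */

	return v;
}
#endif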
2155f0545a5eSad
215633fa5ccbSchs /*
215733fa5ccbSchs * Set an LWP's private data pointer.
215833fa5ccbSchs */
215933fa5ccbSchs int
216033fa5ccbSchs lwp_setprivate(struct lwp *l, void *ptr)
216133fa5ccbSchs {
216233fa5ccbSchs int error = 0;
216333fa5ccbSchs
216433fa5ccbSchs l->l_private = ptr;
216533fa5ccbSchs #ifdef __HAVE_CPU_LWP_SETPRIVATE
216633fa5ccbSchs error = cpu_lwp_setprivate(l, ptr);
216733fa5ccbSchs #endif
216833fa5ccbSchs return error;
216933fa5ccbSchs }
217033fa5ccbSchs
217198a9cebbSthorpej /*
217298a9cebbSthorpej * Perform any thread-related cleanup on LWP exit.
217398a9cebbSthorpej * N.B. l->l_proc->p_lock must be HELD on entry but will
217498a9cebbSthorpej * be released before returning!
217598a9cebbSthorpej */
217698a9cebbSthorpej void
217798a9cebbSthorpej lwp_thread_cleanup(struct lwp *l)
217898a9cebbSthorpej {
217998a9cebbSthorpej
218098a9cebbSthorpej KASSERT(mutex_owned(l->l_proc->p_lock));
218198a9cebbSthorpej mutex_exit(l->l_proc->p_lock);
2182276ef223Sthorpej
2183276ef223Sthorpej /*
2184276ef223Sthorpej * If the LWP has robust futexes, release them all
2185276ef223Sthorpej * now.
2186276ef223Sthorpej */
2187276ef223Sthorpej if (__predict_false(l->l_robust_head != 0)) {
2188978ef622Sthorpej futex_release_all_lwp(l);
2189276ef223Sthorpej }
219098a9cebbSthorpej }
219198a9cebbSthorpej
21920c382013Syamt #if defined(DDB)
21937146b2f6Srmind #include <machine/pcb.h>
21947146b2f6Srmind
21950c382013Syamt void
21960c382013Syamt lwp_whatis(uintptr_t addr, void (*pr)(const char *, ...))
21970c382013Syamt {
21980c382013Syamt lwp_t *l;
21990c382013Syamt
22000c382013Syamt LIST_FOREACH(l, &alllwp, l_list) {
22010c382013Syamt uintptr_t stack = (uintptr_t)KSTACK_LOWEST_ADDR(l);
22020c382013Syamt
22030c382013Syamt if (addr < stack || stack + KSTACK_SIZE <= addr) {
22040c382013Syamt continue;
22050c382013Syamt }
22060c382013Syamt (*pr)("%p is %p+%zu, LWP %p's stack\n",
22070c382013Syamt (void *)addr, (void *)stack,
22080c382013Syamt (size_t)(addr - stack), l);
22090c382013Syamt }
22100c382013Syamt }
22110c382013Syamt #endif /* defined(DDB) */
2212