xref: /freebsd-src/sys/kern/subr_epoch.c (revision fdafd315ad0d0f28a11b9fb4476a9ab059c62b92)
106bf2a6aSMatt Macy /*-
2*4d846d26SWarner Losh  * SPDX-License-Identifier: BSD-2-Clause
31f4beb63SMatt Macy  *
406bf2a6aSMatt Macy  * Copyright (c) 2018, Matthew Macy <mmacy@freebsd.org>
506bf2a6aSMatt Macy  *
606bf2a6aSMatt Macy  * Redistribution and use in source and binary forms, with or without
71f4beb63SMatt Macy  * modification, are permitted provided that the following conditions
81f4beb63SMatt Macy  * are met:
91f4beb63SMatt Macy  * 1. Redistributions of source code must retain the above copyright
101f4beb63SMatt Macy  *    notice, this list of conditions and the following disclaimer.
111f4beb63SMatt Macy  * 2. Redistributions in binary form must reproduce the above copyright
121f4beb63SMatt Macy  *    notice, this list of conditions and the following disclaimer in the
131f4beb63SMatt Macy  *    documentation and/or other materials provided with the distribution.
1406bf2a6aSMatt Macy  *
151f4beb63SMatt Macy  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
161f4beb63SMatt Macy  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
1706bf2a6aSMatt Macy  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
181f4beb63SMatt Macy  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
191f4beb63SMatt Macy  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
201f4beb63SMatt Macy  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
211f4beb63SMatt Macy  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
221f4beb63SMatt Macy  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
231f4beb63SMatt Macy  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
241f4beb63SMatt Macy  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
251f4beb63SMatt Macy  * SUCH DAMAGE.
261f4beb63SMatt Macy  *
2706bf2a6aSMatt Macy  */
2806bf2a6aSMatt Macy 
2906bf2a6aSMatt Macy #include <sys/param.h>
3006bf2a6aSMatt Macy #include <sys/systm.h>
3106bf2a6aSMatt Macy #include <sys/counter.h>
3206bf2a6aSMatt Macy #include <sys/epoch.h>
3306bf2a6aSMatt Macy #include <sys/gtaskqueue.h>
3406bf2a6aSMatt Macy #include <sys/kernel.h>
3506bf2a6aSMatt Macy #include <sys/limits.h>
3606bf2a6aSMatt Macy #include <sys/lock.h>
3706bf2a6aSMatt Macy #include <sys/malloc.h>
3806bf2a6aSMatt Macy #include <sys/mutex.h>
39c4d901e9SMatt Macy #include <sys/pcpu.h>
4006bf2a6aSMatt Macy #include <sys/proc.h>
4106bf2a6aSMatt Macy #include <sys/sched.h>
42131b2b76SHans Petter Selasky #include <sys/sx.h>
4306bf2a6aSMatt Macy #include <sys/smp.h>
4406bf2a6aSMatt Macy #include <sys/sysctl.h>
4506bf2a6aSMatt Macy #include <sys/turnstile.h>
46dd902d01SGleb Smirnoff #ifdef EPOCH_TRACE
47dd902d01SGleb Smirnoff #include <machine/stdarg.h>
48dd902d01SGleb Smirnoff #include <sys/stack.h>
49dd902d01SGleb Smirnoff #include <sys/tree.h>
50dd902d01SGleb Smirnoff #endif
5106bf2a6aSMatt Macy #include <vm/vm.h>
5206bf2a6aSMatt Macy #include <vm/vm_extern.h>
5306bf2a6aSMatt Macy #include <vm/vm_kern.h>
54822e50e3SMatt Macy #include <vm/uma.h>
5506bf2a6aSMatt Macy 
562555f175SKonstantin Belousov #include <machine/stack.h>
572555f175SKonstantin Belousov 
5806bf2a6aSMatt Macy #include <ck_epoch.h>
5906bf2a6aSMatt Macy 
60a82296c2SGleb Smirnoff #ifdef __amd64__
61a82296c2SGleb Smirnoff #define EPOCH_ALIGN CACHE_LINE_SIZE*2
62a82296c2SGleb Smirnoff #else
63a82296c2SGleb Smirnoff #define EPOCH_ALIGN CACHE_LINE_SIZE
64a82296c2SGleb Smirnoff #endif
65a82296c2SGleb Smirnoff 
669f360eecSGleb Smirnoff TAILQ_HEAD (epoch_tdlist, epoch_tracker);
67a82296c2SGleb Smirnoff typedef struct epoch_record {
6891cf4975SMatt Macy 	ck_epoch_record_t er_record;
69131b2b76SHans Petter Selasky 	struct epoch_context er_drain_ctx;
70131b2b76SHans Petter Selasky 	struct epoch *er_parent;
71a82296c2SGleb Smirnoff 	volatile struct epoch_tdlist er_tdlist;
72a82296c2SGleb Smirnoff 	volatile uint32_t er_gen;
73a82296c2SGleb Smirnoff 	uint32_t er_cpuid;
747667824aSKyle Evans #ifdef INVARIANTS
757667824aSKyle Evans 	/* Used to verify record ownership for non-preemptible epochs. */
767667824aSKyle Evans 	struct thread *er_td;
777667824aSKyle Evans #endif
78a82296c2SGleb Smirnoff } __aligned(EPOCH_ALIGN)     *epoch_record_t;
79a82296c2SGleb Smirnoff 
80a82296c2SGleb Smirnoff struct epoch {
81a82296c2SGleb Smirnoff 	struct ck_epoch e_epoch __aligned(EPOCH_ALIGN);
82a82296c2SGleb Smirnoff 	epoch_record_t e_pcpu_record;
83826c0793SHans Petter Selasky 	int	e_in_use;
84a82296c2SGleb Smirnoff 	int	e_flags;
85131b2b76SHans Petter Selasky 	struct sx e_drain_sx;
86131b2b76SHans Petter Selasky 	struct mtx e_drain_mtx;
87131b2b76SHans Petter Selasky 	volatile int e_drain_count;
88dd902d01SGleb Smirnoff 	const char *e_name;
89a82296c2SGleb Smirnoff };
90a82296c2SGleb Smirnoff 
9106bf2a6aSMatt Macy /* arbitrary --- needs benchmarking */
929fec45d8SMatt Macy #define MAX_ADAPTIVE_SPIN 100
93c4d901e9SMatt Macy #define MAX_EPOCHS 64
9406bf2a6aSMatt Macy 
952a45e828SMatt Macy CTASSERT(sizeof(ck_epoch_entry_t) == sizeof(struct epoch_context));
967029da5cSPawel Biernacki SYSCTL_NODE(_kern, OID_AUTO, epoch, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
977029da5cSPawel Biernacki     "epoch information");
987029da5cSPawel Biernacki SYSCTL_NODE(_kern_epoch, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
997029da5cSPawel Biernacki     "epoch stats");
10006bf2a6aSMatt Macy 
10106bf2a6aSMatt Macy /* Stats. */
10206bf2a6aSMatt Macy static counter_u64_t block_count;
103e445381fSMatt Macy 
10406bf2a6aSMatt Macy SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, nblocked, CTLFLAG_RW,
10506bf2a6aSMatt Macy     &block_count, "# of times a thread was in an epoch when epoch_wait was called");
10606bf2a6aSMatt Macy static counter_u64_t migrate_count;
107e445381fSMatt Macy 
10806bf2a6aSMatt Macy SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, migrations, CTLFLAG_RW,
10906bf2a6aSMatt Macy     &migrate_count, "# of times thread was migrated to another CPU in epoch_wait");
11006bf2a6aSMatt Macy static counter_u64_t turnstile_count;
111e445381fSMatt Macy 
11206bf2a6aSMatt Macy SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, ncontended, CTLFLAG_RW,
11306bf2a6aSMatt Macy     &turnstile_count, "# of times a thread was blocked on a lock in an epoch during an epoch_wait");
11406bf2a6aSMatt Macy static counter_u64_t switch_count;
115e445381fSMatt Macy 
11606bf2a6aSMatt Macy SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, switches, CTLFLAG_RW,
11706bf2a6aSMatt Macy     &switch_count, "# of times a thread voluntarily context switched in epoch_wait");
1185e68a3dfSMatt Macy static counter_u64_t epoch_call_count;
119e445381fSMatt Macy 
1205e68a3dfSMatt Macy SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, epoch_calls, CTLFLAG_RW,
1215e68a3dfSMatt Macy     &epoch_call_count, "# of times a callback was deferred");
1225e68a3dfSMatt Macy static counter_u64_t epoch_call_task_count;
123e445381fSMatt Macy 
1245e68a3dfSMatt Macy SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, epoch_call_tasks, CTLFLAG_RW,
1255e68a3dfSMatt Macy     &epoch_call_task_count, "# of times a callback task was run");
12606bf2a6aSMatt Macy 
12706bf2a6aSMatt Macy TAILQ_HEAD (threadlist, thread);
12806bf2a6aSMatt Macy 
1292a45e828SMatt Macy CK_STACK_CONTAINER(struct ck_epoch_entry, stack_entry,
1302a45e828SMatt Macy     ck_epoch_entry_container)
13106bf2a6aSMatt Macy 
132826c0793SHans Petter Selasky static struct epoch epoch_array[MAX_EPOCHS];
133c4d901e9SMatt Macy 
134a5f10424SMatt Macy DPCPU_DEFINE(struct grouptask, epoch_cb_task);
135a5f10424SMatt Macy DPCPU_DEFINE(int, epoch_cb_count);
136c4d901e9SMatt Macy 
13706bf2a6aSMatt Macy static __read_mostly int inited;
1381f4beb63SMatt Macy __read_mostly epoch_t global_epoch;
13970398c2fSMatt Macy __read_mostly epoch_t global_epoch_preempt;
14006bf2a6aSMatt Macy 
141c4d901e9SMatt Macy static void epoch_call_task(void *context __unused);
142822e50e3SMatt Macy static 	uma_zone_t pcpu_zone_record;
14306bf2a6aSMatt Macy 
144826c0793SHans Petter Selasky static struct sx epoch_sx;
145826c0793SHans Petter Selasky 
146826c0793SHans Petter Selasky #define	EPOCH_LOCK() sx_xlock(&epoch_sx)
147826c0793SHans Petter Selasky #define	EPOCH_UNLOCK() sx_xunlock(&epoch_sx)
148826c0793SHans Petter Selasky 
149db0ac6deSCy Schubert static epoch_record_t
150db0ac6deSCy Schubert epoch_currecord(epoch_t epoch)
151db0ac6deSCy Schubert {
152db0ac6deSCy Schubert 
153db0ac6deSCy Schubert 	return (zpcpu_get(epoch->e_pcpu_record));
154db0ac6deSCy Schubert }
155db0ac6deSCy Schubert 
156dd902d01SGleb Smirnoff #ifdef EPOCH_TRACE
157dd902d01SGleb Smirnoff struct stackentry {
158dd902d01SGleb Smirnoff 	RB_ENTRY(stackentry) se_node;
159dd902d01SGleb Smirnoff 	struct stack se_stack;
160dd902d01SGleb Smirnoff };
161dd902d01SGleb Smirnoff 
162dd902d01SGleb Smirnoff static int
163dd902d01SGleb Smirnoff stackentry_compare(struct stackentry *a, struct stackentry *b)
164dd902d01SGleb Smirnoff {
165dd902d01SGleb Smirnoff 
166dd902d01SGleb Smirnoff 	if (a->se_stack.depth > b->se_stack.depth)
167dd902d01SGleb Smirnoff 		return (1);
168dd902d01SGleb Smirnoff 	if (a->se_stack.depth < b->se_stack.depth)
169dd902d01SGleb Smirnoff 		return (-1);
170dd902d01SGleb Smirnoff 	for (int i = 0; i < a->se_stack.depth; i++) {
171dd902d01SGleb Smirnoff 		if (a->se_stack.pcs[i] > b->se_stack.pcs[i])
172dd902d01SGleb Smirnoff 			return (1);
173dd902d01SGleb Smirnoff 		if (a->se_stack.pcs[i] < b->se_stack.pcs[i])
174dd902d01SGleb Smirnoff 			return (-1);
175dd902d01SGleb Smirnoff 	}
176dd902d01SGleb Smirnoff 
177dd902d01SGleb Smirnoff 	return (0);
178dd902d01SGleb Smirnoff }
179dd902d01SGleb Smirnoff 
180dd902d01SGleb Smirnoff RB_HEAD(stacktree, stackentry) epoch_stacks = RB_INITIALIZER(&epoch_stacks);
181dd902d01SGleb Smirnoff RB_GENERATE_STATIC(stacktree, stackentry, se_node, stackentry_compare);
182dd902d01SGleb Smirnoff 
183dd902d01SGleb Smirnoff static struct mtx epoch_stacks_lock;
184dd902d01SGleb Smirnoff MTX_SYSINIT(epochstacks, &epoch_stacks_lock, "epoch_stacks", MTX_DEF);
185dd902d01SGleb Smirnoff 
186173c062aSBjoern A. Zeeb static bool epoch_trace_stack_print = true;
187173c062aSBjoern A. Zeeb SYSCTL_BOOL(_kern_epoch, OID_AUTO, trace_stack_print, CTLFLAG_RWTUN,
188173c062aSBjoern A. Zeeb     &epoch_trace_stack_print, 0, "Print stack traces on epoch reports");
189173c062aSBjoern A. Zeeb 
190dd902d01SGleb Smirnoff static void epoch_trace_report(const char *fmt, ...) __printflike(1, 2);
191dd902d01SGleb Smirnoff static inline void
192dd902d01SGleb Smirnoff epoch_trace_report(const char *fmt, ...)
193dd902d01SGleb Smirnoff {
194dd902d01SGleb Smirnoff 	va_list ap;
195dd902d01SGleb Smirnoff 	struct stackentry se, *new;
196dd902d01SGleb Smirnoff 
197dd902d01SGleb Smirnoff 	stack_save(&se.se_stack);
198dd902d01SGleb Smirnoff 
199dd902d01SGleb Smirnoff 	/* Tree is never reduced - go lockless. */
200dd902d01SGleb Smirnoff 	if (RB_FIND(stacktree, &epoch_stacks, &se) != NULL)
201dd902d01SGleb Smirnoff 		return;
202dd902d01SGleb Smirnoff 
203dd902d01SGleb Smirnoff 	new = malloc(sizeof(*new), M_STACK, M_NOWAIT);
204dd902d01SGleb Smirnoff 	if (new != NULL) {
205dd902d01SGleb Smirnoff 		bcopy(&se.se_stack, &new->se_stack, sizeof(struct stack));
206dd902d01SGleb Smirnoff 
207dd902d01SGleb Smirnoff 		mtx_lock(&epoch_stacks_lock);
208dd902d01SGleb Smirnoff 		new = RB_INSERT(stacktree, &epoch_stacks, new);
209dd902d01SGleb Smirnoff 		mtx_unlock(&epoch_stacks_lock);
210dd902d01SGleb Smirnoff 		if (new != NULL)
211dd902d01SGleb Smirnoff 			free(new, M_STACK);
212dd902d01SGleb Smirnoff 	}
213dd902d01SGleb Smirnoff 
214dd902d01SGleb Smirnoff 	va_start(ap, fmt);
215dd902d01SGleb Smirnoff 	(void)vprintf(fmt, ap);
216dd902d01SGleb Smirnoff 	va_end(ap);
217173c062aSBjoern A. Zeeb 	if (epoch_trace_stack_print)
218dd902d01SGleb Smirnoff 		stack_print_ddb(&se.se_stack);
219dd902d01SGleb Smirnoff }
220dd902d01SGleb Smirnoff 
221dd902d01SGleb Smirnoff static inline void
222dd902d01SGleb Smirnoff epoch_trace_enter(struct thread *td, epoch_t epoch, epoch_tracker_t et,
223dd902d01SGleb Smirnoff     const char *file, int line)
224dd902d01SGleb Smirnoff {
225dd902d01SGleb Smirnoff 	epoch_tracker_t iet;
226dd902d01SGleb Smirnoff 
227cc9bb7a9SHans Petter Selasky 	SLIST_FOREACH(iet, &td->td_epochs, et_tlink) {
228cc9bb7a9SHans Petter Selasky 		if (iet->et_epoch != epoch)
229cc9bb7a9SHans Petter Selasky 			continue;
230dd902d01SGleb Smirnoff 		epoch_trace_report("Recursively entering epoch %s "
231173c062aSBjoern A. Zeeb 		    "at %s:%d, previously entered at %s:%d\n",
232173c062aSBjoern A. Zeeb 		    epoch->e_name, file, line,
233173c062aSBjoern A. Zeeb 		    iet->et_file, iet->et_line);
234cc9bb7a9SHans Petter Selasky 	}
235dd902d01SGleb Smirnoff 	et->et_epoch = epoch;
236dd902d01SGleb Smirnoff 	et->et_file = file;
237dd902d01SGleb Smirnoff 	et->et_line = line;
238db0ac6deSCy Schubert 	et->et_flags = 0;
239dd902d01SGleb Smirnoff 	SLIST_INSERT_HEAD(&td->td_epochs, et, et_tlink);
240dd902d01SGleb Smirnoff }
241dd902d01SGleb Smirnoff 
242dd902d01SGleb Smirnoff static inline void
243dd902d01SGleb Smirnoff epoch_trace_exit(struct thread *td, epoch_t epoch, epoch_tracker_t et,
244dd902d01SGleb Smirnoff     const char *file, int line)
245dd902d01SGleb Smirnoff {
246dd902d01SGleb Smirnoff 
247dd902d01SGleb Smirnoff 	if (SLIST_FIRST(&td->td_epochs) != et) {
248173c062aSBjoern A. Zeeb 		epoch_trace_report("Exiting epoch %s in a not nested order "
249173c062aSBjoern A. Zeeb 		    "at %s:%d. Most recently entered %s at %s:%d\n",
250dd902d01SGleb Smirnoff 		    epoch->e_name,
251173c062aSBjoern A. Zeeb 		    file, line,
252dd902d01SGleb Smirnoff 		    SLIST_FIRST(&td->td_epochs)->et_epoch->e_name,
253dd902d01SGleb Smirnoff 		    SLIST_FIRST(&td->td_epochs)->et_file,
254dd902d01SGleb Smirnoff 		    SLIST_FIRST(&td->td_epochs)->et_line);
255dd902d01SGleb Smirnoff 		/* This will panic if et is not anywhere on td_epochs. */
256dd902d01SGleb Smirnoff 		SLIST_REMOVE(&td->td_epochs, et, epoch_tracker, et_tlink);
257dd902d01SGleb Smirnoff 	} else
258dd902d01SGleb Smirnoff 		SLIST_REMOVE_HEAD(&td->td_epochs, et_tlink);
259db0ac6deSCy Schubert 	if (et->et_flags & ET_REPORT_EXIT)
260db0ac6deSCy Schubert 		printf("Td %p exiting epoch %s at %s:%d\n", td, epoch->e_name,
261db0ac6deSCy Schubert 		    file, line);
262dd902d01SGleb Smirnoff }
263bac06038SGleb Smirnoff 
264bac06038SGleb Smirnoff /* Used by assertions that check thread state before going to sleep. */
265bac06038SGleb Smirnoff void
266bac06038SGleb Smirnoff epoch_trace_list(struct thread *td)
267bac06038SGleb Smirnoff {
268bac06038SGleb Smirnoff 	epoch_tracker_t iet;
269bac06038SGleb Smirnoff 
270bac06038SGleb Smirnoff 	SLIST_FOREACH(iet, &td->td_epochs, et_tlink)
271bac06038SGleb Smirnoff 		printf("Epoch %s entered at %s:%d\n", iet->et_epoch->e_name,
272bac06038SGleb Smirnoff 		    iet->et_file, iet->et_line);
273bac06038SGleb Smirnoff }
274db0ac6deSCy Schubert 
275db0ac6deSCy Schubert void
276db0ac6deSCy Schubert epoch_where_report(epoch_t epoch)
277db0ac6deSCy Schubert {
278db0ac6deSCy Schubert 	epoch_record_t er;
279db0ac6deSCy Schubert 	struct epoch_tracker *tdwait;
280db0ac6deSCy Schubert 
281db0ac6deSCy Schubert 	MPASS(epoch != NULL);
282db0ac6deSCy Schubert 	MPASS((epoch->e_flags & EPOCH_PREEMPT) != 0);
283db0ac6deSCy Schubert 	MPASS(!THREAD_CAN_SLEEP());
284db0ac6deSCy Schubert 	critical_enter();
285db0ac6deSCy Schubert 	er = epoch_currecord(epoch);
286db0ac6deSCy Schubert 	TAILQ_FOREACH(tdwait, &er->er_tdlist, et_link)
287db0ac6deSCy Schubert 		if (tdwait->et_td == curthread)
288db0ac6deSCy Schubert 			break;
289db0ac6deSCy Schubert 	critical_exit();
290db0ac6deSCy Schubert 	if (tdwait != NULL) {
291db0ac6deSCy Schubert 		tdwait->et_flags |= ET_REPORT_EXIT;
292db0ac6deSCy Schubert 		printf("Td %p entered epoch %s at %s:%d\n", curthread,
293db0ac6deSCy Schubert 		    epoch->e_name, tdwait->et_file, tdwait->et_line);
294db0ac6deSCy Schubert 	}
295db0ac6deSCy Schubert }
296dd902d01SGleb Smirnoff #endif /* EPOCH_TRACE */
297dd902d01SGleb Smirnoff 
29806bf2a6aSMatt Macy static void
29906bf2a6aSMatt Macy epoch_init(void *arg __unused)
30006bf2a6aSMatt Macy {
301822e50e3SMatt Macy 	int cpu;
302b2cb2896SMatt Macy 
303b2cb2896SMatt Macy 	block_count = counter_u64_alloc(M_WAITOK);
304b2cb2896SMatt Macy 	migrate_count = counter_u64_alloc(M_WAITOK);
305b2cb2896SMatt Macy 	turnstile_count = counter_u64_alloc(M_WAITOK);
306b2cb2896SMatt Macy 	switch_count = counter_u64_alloc(M_WAITOK);
30760b7b90dSMatt Macy 	epoch_call_count = counter_u64_alloc(M_WAITOK);
30860b7b90dSMatt Macy 	epoch_call_task_count = counter_u64_alloc(M_WAITOK);
30906bf2a6aSMatt Macy 
310635c1884SGleb Smirnoff 	pcpu_zone_record = uma_zcreate("epoch_record pcpu",
311635c1884SGleb Smirnoff 	    sizeof(struct epoch_record), NULL, NULL, NULL, NULL,
312635c1884SGleb Smirnoff 	    UMA_ALIGN_PTR, UMA_ZONE_PCPU);
313c4d901e9SMatt Macy 	CPU_FOREACH(cpu) {
314635c1884SGleb Smirnoff 		GROUPTASK_INIT(DPCPU_ID_PTR(cpu, epoch_cb_task), 0,
315635c1884SGleb Smirnoff 		    epoch_call_task, NULL);
316635c1884SGleb Smirnoff 		taskqgroup_attach_cpu(qgroup_softirq,
317f855ec81SMarius Strobl 		    DPCPU_ID_PTR(cpu, epoch_cb_task), NULL, cpu, NULL, NULL,
318635c1884SGleb Smirnoff 		    "epoch call task");
319c4d901e9SMatt Macy 	}
320f6eccf96SGleb Smirnoff #ifdef EPOCH_TRACE
321dd902d01SGleb Smirnoff 	SLIST_INIT(&thread0.td_epochs);
322f6eccf96SGleb Smirnoff #endif
323826c0793SHans Petter Selasky 	sx_init(&epoch_sx, "epoch-sx");
32406bf2a6aSMatt Macy 	inited = 1;
325dd902d01SGleb Smirnoff 	global_epoch = epoch_alloc("Global", 0);
326dd902d01SGleb Smirnoff 	global_epoch_preempt = epoch_alloc("Global preemptible", EPOCH_PREEMPT);
32706bf2a6aSMatt Macy }
3287993a104SConrad Meyer SYSINIT(epoch, SI_SUB_EPOCH, SI_ORDER_FIRST, epoch_init, NULL);
32906bf2a6aSMatt Macy 
3300bcfb473SMatt Macy #if !defined(EARLY_AP_STARTUP)
3310bcfb473SMatt Macy static void
3320bcfb473SMatt Macy epoch_init_smp(void *dummy __unused)
3330bcfb473SMatt Macy {
3340bcfb473SMatt Macy 	inited = 2;
3350bcfb473SMatt Macy }
3360bcfb473SMatt Macy SYSINIT(epoch_smp, SI_SUB_SMP + 1, SI_ORDER_FIRST, epoch_init_smp, NULL);
3370bcfb473SMatt Macy #endif
3380bcfb473SMatt Macy 
33906bf2a6aSMatt Macy static void
340822e50e3SMatt Macy epoch_ctor(epoch_t epoch)
34106bf2a6aSMatt Macy {
34206bf2a6aSMatt Macy 	epoch_record_t er;
343822e50e3SMatt Macy 	int cpu;
34406bf2a6aSMatt Macy 
345822e50e3SMatt Macy 	epoch->e_pcpu_record = uma_zalloc_pcpu(pcpu_zone_record, M_WAITOK);
346822e50e3SMatt Macy 	CPU_FOREACH(cpu) {
347822e50e3SMatt Macy 		er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
348822e50e3SMatt Macy 		bzero(er, sizeof(*er));
34991cf4975SMatt Macy 		ck_epoch_register(&epoch->e_epoch, &er->er_record, NULL);
35006bf2a6aSMatt Macy 		TAILQ_INIT((struct threadlist *)(uintptr_t)&er->er_tdlist);
351822e50e3SMatt Macy 		er->er_cpuid = cpu;
352131b2b76SHans Petter Selasky 		er->er_parent = epoch;
35306bf2a6aSMatt Macy 	}
35406bf2a6aSMatt Macy }
35506bf2a6aSMatt Macy 
356a82296c2SGleb Smirnoff static void
357a82296c2SGleb Smirnoff epoch_adjust_prio(struct thread *td, u_char prio)
358a82296c2SGleb Smirnoff {
359a82296c2SGleb Smirnoff 
360a82296c2SGleb Smirnoff 	thread_lock(td);
361a82296c2SGleb Smirnoff 	sched_prio(td, prio);
362a82296c2SGleb Smirnoff 	thread_unlock(td);
363a82296c2SGleb Smirnoff }
364a82296c2SGleb Smirnoff 
36506bf2a6aSMatt Macy epoch_t
366dd902d01SGleb Smirnoff epoch_alloc(const char *name, int flags)
36706bf2a6aSMatt Macy {
36806bf2a6aSMatt Macy 	epoch_t epoch;
369826c0793SHans Petter Selasky 	int i;
370826c0793SHans Petter Selasky 
371826c0793SHans Petter Selasky 	MPASS(name != NULL);
37206bf2a6aSMatt Macy 
37306bf2a6aSMatt Macy 	if (__predict_false(!inited))
37406bf2a6aSMatt Macy 		panic("%s called too early in boot", __func__);
375826c0793SHans Petter Selasky 
376826c0793SHans Petter Selasky 	EPOCH_LOCK();
377826c0793SHans Petter Selasky 
378826c0793SHans Petter Selasky 	/*
379826c0793SHans Petter Selasky 	 * Find a free index in the epoch array. If no free index is
380826c0793SHans Petter Selasky 	 * found, try to use the index after the last one.
381826c0793SHans Petter Selasky 	 */
382826c0793SHans Petter Selasky 	for (i = 0;; i++) {
383826c0793SHans Petter Selasky 		/*
384826c0793SHans Petter Selasky 		 * If too many epochs are currently allocated,
385826c0793SHans Petter Selasky 		 * return NULL.
386826c0793SHans Petter Selasky 		 */
387826c0793SHans Petter Selasky 		if (i == MAX_EPOCHS) {
388826c0793SHans Petter Selasky 			epoch = NULL;
389826c0793SHans Petter Selasky 			goto done;
390826c0793SHans Petter Selasky 		}
391826c0793SHans Petter Selasky 		if (epoch_array[i].e_in_use == 0)
392826c0793SHans Petter Selasky 			break;
393826c0793SHans Petter Selasky 	}
394826c0793SHans Petter Selasky 
395826c0793SHans Petter Selasky 	epoch = epoch_array + i;
39606bf2a6aSMatt Macy 	ck_epoch_init(&epoch->e_epoch);
397822e50e3SMatt Macy 	epoch_ctor(epoch);
3985e68a3dfSMatt Macy 	epoch->e_flags = flags;
399dd902d01SGleb Smirnoff 	epoch->e_name = name;
400131b2b76SHans Petter Selasky 	sx_init(&epoch->e_drain_sx, "epoch-drain-sx");
401131b2b76SHans Petter Selasky 	mtx_init(&epoch->e_drain_mtx, "epoch-drain-mtx", NULL, MTX_DEF);
402826c0793SHans Petter Selasky 
403826c0793SHans Petter Selasky 	/*
404826c0793SHans Petter Selasky 	 * Set e_in_use last, because when this field is set the
405826c0793SHans Petter Selasky 	 * epoch_call_task() function will start scanning this epoch
406826c0793SHans Petter Selasky 	 * structure.
407826c0793SHans Petter Selasky 	 */
408826c0793SHans Petter Selasky 	atomic_store_rel_int(&epoch->e_in_use, 1);
409826c0793SHans Petter Selasky done:
410826c0793SHans Petter Selasky 	EPOCH_UNLOCK();
41106bf2a6aSMatt Macy 	return (epoch);
41206bf2a6aSMatt Macy }
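
/*
 * Usage sketch (illustrative only; the "foo" names are hypothetical): a
 * subsystem typically allocates its epoch once during initialization,
 * some time after SI_SUB_EPOCH, since epoch_alloc() panics if called
 * before epoch_init() has run.
 *
 *	static epoch_t foo_epoch;
 *
 *	static void
 *	foo_init(void)
 *	{
 *		foo_epoch = epoch_alloc("foo", EPOCH_PREEMPT);
 *	}
 */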
41306bf2a6aSMatt Macy 
41406bf2a6aSMatt Macy void
41506bf2a6aSMatt Macy epoch_free(epoch_t epoch)
41606bf2a6aSMatt Macy {
4177667824aSKyle Evans #ifdef INVARIANTS
4187667824aSKyle Evans 	int cpu;
4197667824aSKyle Evans #endif
42006bf2a6aSMatt Macy 
421826c0793SHans Petter Selasky 	EPOCH_LOCK();
422826c0793SHans Petter Selasky 
423826c0793SHans Petter Selasky 	MPASS(epoch->e_in_use != 0);
424826c0793SHans Petter Selasky 
425131b2b76SHans Petter Selasky 	epoch_drain_callbacks(epoch);
426826c0793SHans Petter Selasky 
427826c0793SHans Petter Selasky 	atomic_store_rel_int(&epoch->e_in_use, 0);
428826c0793SHans Petter Selasky 	/*
429826c0793SHans Petter Selasky 	 * Make sure the epoch_call_task() function see e_in_use equal
429826c0793SHans Petter Selasky 	 * Make sure the epoch_call_task() function sees e_in_use equal
431826c0793SHans Petter Selasky 	 */
43270398c2fSMatt Macy 	epoch_wait(global_epoch);
4337667824aSKyle Evans #ifdef INVARIANTS
4347667824aSKyle Evans 	CPU_FOREACH(cpu) {
4357667824aSKyle Evans 		epoch_record_t er;
4367667824aSKyle Evans 
4377667824aSKyle Evans 		er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
4387667824aSKyle Evans 
4397667824aSKyle Evans 		/*
4407667824aSKyle Evans 		 * Sanity check: none of the records should be in use anymore.
4417667824aSKyle Evans 		 * We drained callbacks above and freeing the pcpu records is
4427667824aSKyle Evans 		 * imminent.
4437667824aSKyle Evans 		 */
4447667824aSKyle Evans 		MPASS(er->er_td == NULL);
4457667824aSKyle Evans 		MPASS(TAILQ_EMPTY(&er->er_tdlist));
4467667824aSKyle Evans 	}
4477667824aSKyle Evans #endif
448822e50e3SMatt Macy 	uma_zfree_pcpu(pcpu_zone_record, epoch->e_pcpu_record);
449131b2b76SHans Petter Selasky 	mtx_destroy(&epoch->e_drain_mtx);
450131b2b76SHans Petter Selasky 	sx_destroy(&epoch->e_drain_sx);
451826c0793SHans Petter Selasky 	memset(epoch, 0, sizeof(*epoch));
452826c0793SHans Petter Selasky 
453826c0793SHans Petter Selasky 	EPOCH_UNLOCK();
45406bf2a6aSMatt Macy }
45506bf2a6aSMatt Macy 
456a82296c2SGleb Smirnoff #define INIT_CHECK(epoch)					\
457a82296c2SGleb Smirnoff 	do {							\
458a82296c2SGleb Smirnoff 		if (__predict_false((epoch) == NULL))		\
459a82296c2SGleb Smirnoff 			return;					\
460a82296c2SGleb Smirnoff 	} while (0)
461a82296c2SGleb Smirnoff 
462a82296c2SGleb Smirnoff void
463dd902d01SGleb Smirnoff _epoch_enter_preempt(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE)
464a82296c2SGleb Smirnoff {
465a82296c2SGleb Smirnoff 	struct epoch_record *er;
466a760c50cSGleb Smirnoff 	struct thread *td;
467a82296c2SGleb Smirnoff 
468a82296c2SGleb Smirnoff 	MPASS(cold || epoch != NULL);
469a760c50cSGleb Smirnoff 	td = curthread;
470fa1d803cSBrooks Davis 	MPASS(kstack_contains(td, (vm_offset_t)et, sizeof(*et)));
47177d70e51SGleb Smirnoff 
47277d70e51SGleb Smirnoff 	INIT_CHECK(epoch);
473c82c2006SHans Petter Selasky 	MPASS(epoch->e_flags & EPOCH_PREEMPT);
474c82c2006SHans Petter Selasky 
475dd902d01SGleb Smirnoff #ifdef EPOCH_TRACE
476dd902d01SGleb Smirnoff 	epoch_trace_enter(td, epoch, et, file, line);
477dd902d01SGleb Smirnoff #endif
478a760c50cSGleb Smirnoff 	et->et_td = td;
4795757b59fSGleb Smirnoff 	THREAD_NO_SLEEPING();
480a82296c2SGleb Smirnoff 	critical_enter();
481a760c50cSGleb Smirnoff 	sched_pin();
482ef0f7ae9SHans Petter Selasky 	et->et_old_priority = td->td_priority;
483a82296c2SGleb Smirnoff 	er = epoch_currecord(epoch);
4847667824aSKyle Evans 	/* Record-level tracking is reserved for non-preemptible epochs. */
4857667824aSKyle Evans 	MPASS(er->er_td == NULL);
4869f360eecSGleb Smirnoff 	TAILQ_INSERT_TAIL(&er->er_tdlist, et, et_link);
48791cf4975SMatt Macy 	ck_epoch_begin(&er->er_record, &et->et_section);
488a82296c2SGleb Smirnoff 	critical_exit();
489c4d901e9SMatt Macy }
490c4d901e9SMatt Macy 
49106bf2a6aSMatt Macy void
492a82296c2SGleb Smirnoff epoch_enter(epoch_t epoch)
49306bf2a6aSMatt Macy {
494a82296c2SGleb Smirnoff 	epoch_record_t er;
49506bf2a6aSMatt Macy 
496a82296c2SGleb Smirnoff 	MPASS(cold || epoch != NULL);
497a82296c2SGleb Smirnoff 	INIT_CHECK(epoch);
498a82296c2SGleb Smirnoff 	critical_enter();
499a82296c2SGleb Smirnoff 	er = epoch_currecord(epoch);
5007667824aSKyle Evans #ifdef INVARIANTS
5017667824aSKyle Evans 	if (er->er_record.active == 0) {
5027667824aSKyle Evans 		MPASS(er->er_td == NULL);
5037667824aSKyle Evans 		er->er_td = curthread;
5047667824aSKyle Evans 	} else {
5057667824aSKyle Evans 		/* We've recursed, just make sure our accounting isn't wrong. */
5067667824aSKyle Evans 		MPASS(er->er_td == curthread);
5077667824aSKyle Evans 	}
5087667824aSKyle Evans #endif
50991cf4975SMatt Macy 	ck_epoch_begin(&er->er_record, NULL);
51006bf2a6aSMatt Macy }
51106bf2a6aSMatt Macy 
5125e68a3dfSMatt Macy void
513dd902d01SGleb Smirnoff _epoch_exit_preempt(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE)
514c4d901e9SMatt Macy {
515a82296c2SGleb Smirnoff 	struct epoch_record *er;
516a760c50cSGleb Smirnoff 	struct thread *td;
517c4d901e9SMatt Macy 
518a82296c2SGleb Smirnoff 	INIT_CHECK(epoch);
519a760c50cSGleb Smirnoff 	td = curthread;
520a82296c2SGleb Smirnoff 	critical_enter();
521a760c50cSGleb Smirnoff 	sched_unpin();
5225757b59fSGleb Smirnoff 	THREAD_SLEEPING_OK();
523a82296c2SGleb Smirnoff 	er = epoch_currecord(epoch);
524a82296c2SGleb Smirnoff 	MPASS(epoch->e_flags & EPOCH_PREEMPT);
5259f360eecSGleb Smirnoff 	MPASS(et != NULL);
526a760c50cSGleb Smirnoff 	MPASS(et->et_td == td);
5279f360eecSGleb Smirnoff #ifdef INVARIANTS
5289f360eecSGleb Smirnoff 	et->et_td = (void*)0xDEADBEEF;
5297667824aSKyle Evans 	/* Record-level tracking is reserved for non-preemptible epochs. */
5307667824aSKyle Evans 	MPASS(er->er_td == NULL);
5319f360eecSGleb Smirnoff #endif
53291cf4975SMatt Macy 	ck_epoch_end(&er->er_record, &et->et_section);
5339f360eecSGleb Smirnoff 	TAILQ_REMOVE(&er->er_tdlist, et, et_link);
534a82296c2SGleb Smirnoff 	er->er_gen++;
535ef0f7ae9SHans Petter Selasky 	if (__predict_false(et->et_old_priority != td->td_priority))
536ef0f7ae9SHans Petter Selasky 		epoch_adjust_prio(td, et->et_old_priority);
537a82296c2SGleb Smirnoff 	critical_exit();
538dd902d01SGleb Smirnoff #ifdef EPOCH_TRACE
539dd902d01SGleb Smirnoff 	epoch_trace_exit(td, epoch, et, file, line);
540dd902d01SGleb Smirnoff #endif
5416573d758SMatt Macy }
5426573d758SMatt Macy 
5436573d758SMatt Macy void
544a82296c2SGleb Smirnoff epoch_exit(epoch_t epoch)
5456573d758SMatt Macy {
546a82296c2SGleb Smirnoff 	epoch_record_t er;
5476573d758SMatt Macy 
548a82296c2SGleb Smirnoff 	INIT_CHECK(epoch);
549a82296c2SGleb Smirnoff 	er = epoch_currecord(epoch);
55091cf4975SMatt Macy 	ck_epoch_end(&er->er_record, NULL);
5517667824aSKyle Evans #ifdef INVARIANTS
5527667824aSKyle Evans 	MPASS(er->er_td == curthread);
5537667824aSKyle Evans 	if (er->er_record.active == 0)
5547667824aSKyle Evans 		er->er_td = NULL;
5557667824aSKyle Evans #endif
556a82296c2SGleb Smirnoff 	critical_exit();
557c4d901e9SMatt Macy }
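
/*
 * Reader-side sketch (illustrative only; "foo_epoch", "foo_list",
 * "f_link" and foo_inspect() are hypothetical): a preemptible epoch
 * section uses an on-stack tracker, while the non-preemptible
 * epoch_enter()/epoch_exit() pair above runs inside a critical section
 * and must not sleep.
 *
 *	struct epoch_tracker et;
 *	struct foo *p;
 *
 *	epoch_enter_preempt(foo_epoch, &et);
 *	CK_LIST_FOREACH(p, &foo_list, f_link)
 *		foo_inspect(p);
 *	epoch_exit_preempt(foo_epoch, &et);
 */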
558c4d901e9SMatt Macy 
55906bf2a6aSMatt Macy /*
560635c1884SGleb Smirnoff  * epoch_block_handler_preempt() is a callback from the CK code when another
561635c1884SGleb Smirnoff  * thread is currently in an epoch section.
56206bf2a6aSMatt Macy  */
56306bf2a6aSMatt Macy static void
564635c1884SGleb Smirnoff epoch_block_handler_preempt(struct ck_epoch *global __unused,
565635c1884SGleb Smirnoff     ck_epoch_record_t *cr, void *arg __unused)
56606bf2a6aSMatt Macy {
56706bf2a6aSMatt Macy 	epoch_record_t record;
5686573d758SMatt Macy 	struct thread *td, *owner, *curwaittd;
5699f360eecSGleb Smirnoff 	struct epoch_tracker *tdwait;
57006bf2a6aSMatt Macy 	struct turnstile *ts;
57106bf2a6aSMatt Macy 	struct lock_object *lock;
572b2cb2896SMatt Macy 	int spincount, gen;
57374333b3dSMatt Macy 	int locksheld __unused;
57406bf2a6aSMatt Macy 
57591cf4975SMatt Macy 	record = __containerof(cr, struct epoch_record, er_record);
57606bf2a6aSMatt Macy 	td = curthread;
57774333b3dSMatt Macy 	locksheld = td->td_locks;
57806bf2a6aSMatt Macy 	spincount = 0;
57906bf2a6aSMatt Macy 	counter_u64_add(block_count, 1);
5809fec45d8SMatt Macy 	/*
5819fec45d8SMatt Macy 	 * We lost a race and there are no longer any threads
5829fec45d8SMatt Macy 	 * on the CPU in an epoch section.
5839fec45d8SMatt Macy 	 */
5849fec45d8SMatt Macy 	if (TAILQ_EMPTY(&record->er_tdlist))
5859fec45d8SMatt Macy 		return;
5869fec45d8SMatt Macy 
58706bf2a6aSMatt Macy 	if (record->er_cpuid != curcpu) {
58806bf2a6aSMatt Macy 		/*
58906bf2a6aSMatt Macy 		 * If the head of the list is running, we can wait for it
59006bf2a6aSMatt Macy 		 * to remove itself from the list and thus save us the
59106bf2a6aSMatt Macy 		 * overhead of a migration
59206bf2a6aSMatt Macy 		 * overhead of a migration.
593b2cb2896SMatt Macy 		gen = record->er_gen;
594b2cb2896SMatt Macy 		thread_unlock(td);
5959fec45d8SMatt Macy 		/*
5969fec45d8SMatt Macy 		 * We can't actually check if the waiting thread is running
5979fec45d8SMatt Macy 		 * so we simply poll for it to exit before giving up and
5989fec45d8SMatt Macy 		 * migrating.
5999fec45d8SMatt Macy 		 */
600b2cb2896SMatt Macy 		do {
60106bf2a6aSMatt Macy 			cpu_spinwait();
6029fec45d8SMatt Macy 		} while (!TAILQ_EMPTY(&record->er_tdlist) &&
6039fec45d8SMatt Macy 				 gen == record->er_gen &&
604b2cb2896SMatt Macy 				 spincount++ < MAX_ADAPTIVE_SPIN);
605b2cb2896SMatt Macy 		thread_lock(td);
6069fec45d8SMatt Macy 		/*
6079fec45d8SMatt Macy 		 * If the generation has changed we can poll again,
6089fec45d8SMatt Macy 		 * otherwise we need to migrate.
6099fec45d8SMatt Macy 		 */
6109fec45d8SMatt Macy 		if (gen != record->er_gen)
61106bf2a6aSMatt Macy 			return;
61206bf2a6aSMatt Macy 		/*
61306bf2a6aSMatt Macy 		 * Being on the same CPU as that of the record on which
61406bf2a6aSMatt Macy 		 * we need to wait allows us access to the thread
61506bf2a6aSMatt Macy 		 * list associated with that CPU. We can then examine the
61606bf2a6aSMatt Macy 		 * oldest thread in the queue and wait on its turnstile
61706bf2a6aSMatt Macy 		 * until it resumes and so on until a grace period
61806bf2a6aSMatt Macy 		 * elapses.
61906bf2a6aSMatt Macy 		 *
62006bf2a6aSMatt Macy 		 */
62106bf2a6aSMatt Macy 		counter_u64_add(migrate_count, 1);
62206bf2a6aSMatt Macy 		sched_bind(td, record->er_cpuid);
62306bf2a6aSMatt Macy 		/*
62406bf2a6aSMatt Macy 		 * At this point we need to return to the ck code
62506bf2a6aSMatt Macy 		 * to scan to see if a grace period has elapsed.
62606bf2a6aSMatt Macy 		 * We can't move on to check the thread list, because
62706bf2a6aSMatt Macy 		 * in the meantime new threads may have arrived that
62806bf2a6aSMatt Macy 		 * in fact belong to a different epoch.
62906bf2a6aSMatt Macy 		 */
63006bf2a6aSMatt Macy 		return;
63106bf2a6aSMatt Macy 	}
63206bf2a6aSMatt Macy 	/*
63306bf2a6aSMatt Macy 	 * Try to find a thread in an epoch section on this CPU
63406bf2a6aSMatt Macy 	 * waiting on a turnstile. Otherwise find the lowest
63506bf2a6aSMatt Macy 	 * priority thread (highest prio value) and drop our priority
63606bf2a6aSMatt Macy 	 * to match to allow it to run.
63706bf2a6aSMatt Macy 	 */
6386573d758SMatt Macy 	TAILQ_FOREACH(tdwait, &record->er_tdlist, et_link) {
639b2cb2896SMatt Macy 		/*
640b2cb2896SMatt Macy 		 * Propagate our priority to any other waiters to prevent us
641b2cb2896SMatt Macy 		 * from starving them. They will have their original priority
642b2cb2896SMatt Macy 		 * restored on exit from epoch_wait().
643b2cb2896SMatt Macy 		 */
6446573d758SMatt Macy 		curwaittd = tdwait->et_td;
6456573d758SMatt Macy 		if (!TD_IS_INHIBITED(curwaittd) && curwaittd->td_priority > td->td_priority) {
646d1bcb409SMatt Macy 			critical_enter();
647d1bcb409SMatt Macy 			thread_unlock(td);
6486573d758SMatt Macy 			thread_lock(curwaittd);
6496573d758SMatt Macy 			sched_prio(curwaittd, td->td_priority);
6506573d758SMatt Macy 			thread_unlock(curwaittd);
651d1bcb409SMatt Macy 			thread_lock(td);
652d1bcb409SMatt Macy 			critical_exit();
653b2cb2896SMatt Macy 		}
6546573d758SMatt Macy 		if (TD_IS_INHIBITED(curwaittd) && TD_ON_LOCK(curwaittd) &&
6556573d758SMatt Macy 		    ((ts = curwaittd->td_blocked) != NULL)) {
65606bf2a6aSMatt Macy 			/*
657635c1884SGleb Smirnoff 			 * We unlock td to allow turnstile_wait to reacquire
658635c1884SGleb Smirnoff 			 * the thread lock. Before unlocking it we enter a
659635c1884SGleb Smirnoff 			 * critical section so that, once interrupts are
660635c1884SGleb Smirnoff 			 * re-enabled by dropping the thread lock, we cannot
661635c1884SGleb Smirnoff 			 * be preempted and curwaittd cannot get to run.
66206bf2a6aSMatt Macy 			 */
66306bf2a6aSMatt Macy 			critical_enter();
66406bf2a6aSMatt Macy 			thread_unlock(td);
6652fb62b1aSMark Johnston 
6662fb62b1aSMark Johnston 			if (turnstile_lock(ts, &lock, &owner)) {
6672fb62b1aSMark Johnston 				if (ts == curwaittd->td_blocked) {
668635c1884SGleb Smirnoff 					MPASS(TD_IS_INHIBITED(curwaittd) &&
669635c1884SGleb Smirnoff 					    TD_ON_LOCK(curwaittd));
67006bf2a6aSMatt Macy 					critical_exit();
6712fb62b1aSMark Johnston 					turnstile_wait(ts, owner,
6722fb62b1aSMark Johnston 					    curwaittd->td_tsqueue);
67306bf2a6aSMatt Macy 					counter_u64_add(turnstile_count, 1);
67406bf2a6aSMatt Macy 					thread_lock(td);
67506bf2a6aSMatt Macy 					return;
6762fb62b1aSMark Johnston 				}
67706bf2a6aSMatt Macy 				turnstile_unlock(ts, lock);
6782fb62b1aSMark Johnston 			}
67906bf2a6aSMatt Macy 			thread_lock(td);
68006bf2a6aSMatt Macy 			critical_exit();
68174333b3dSMatt Macy 			KASSERT(td->td_locks == locksheld,
68274333b3dSMatt Macy 			    ("%d extra locks held", td->td_locks - locksheld));
68306bf2a6aSMatt Macy 		}
68406bf2a6aSMatt Macy 	}
68506bf2a6aSMatt Macy 	/*
68606bf2a6aSMatt Macy 	 * We didn't find any threads actually blocked on a lock
687b2cb2896SMatt Macy 	 * so we have nothing to do except context switch away.
68806bf2a6aSMatt Macy 	 */
68906bf2a6aSMatt Macy 	counter_u64_add(switch_count, 1);
690686bcb5cSJeff Roberson 	mi_switch(SW_VOL | SWT_RELINQUISH);
691cc79ea3aSHans Petter Selasky 	/*
692cc79ea3aSHans Petter Selasky 	 * It is important that the thread lock is dropped while yielding
693cc79ea3aSHans Petter Selasky 	 * to allow other threads to acquire the lock pointed to by
694cc79ea3aSHans Petter Selasky 	 * TDQ_LOCKPTR(td). Currently mi_switch() will unlock the
695cc79ea3aSHans Petter Selasky 	 * thread lock before returning. Otherwise a deadlock-like
696cc79ea3aSHans Petter Selasky 	 * situation might arise.
697cc79ea3aSHans Petter Selasky 	 */
69806bf2a6aSMatt Macy 	thread_lock(td);
69906bf2a6aSMatt Macy }
70006bf2a6aSMatt Macy 
70106bf2a6aSMatt Macy void
70270398c2fSMatt Macy epoch_wait_preempt(epoch_t epoch)
70306bf2a6aSMatt Macy {
70406bf2a6aSMatt Macy 	struct thread *td;
70506bf2a6aSMatt Macy 	int was_bound;
70606bf2a6aSMatt Macy 	int old_cpu;
70706bf2a6aSMatt Macy 	int old_pinned;
70806bf2a6aSMatt Macy 	u_char old_prio;
70974333b3dSMatt Macy 	int locks __unused;
71020ba6811SMatt Macy 
71120ba6811SMatt Macy 	MPASS(cold || epoch != NULL);
71206bf2a6aSMatt Macy 	INIT_CHECK(epoch);
71306bf2a6aSMatt Macy 	td = curthread;
71474333b3dSMatt Macy #ifdef INVARIANTS
71574333b3dSMatt Macy 	locks = curthread->td_locks;
71674333b3dSMatt Macy 	MPASS(epoch->e_flags & EPOCH_PREEMPT);
71774333b3dSMatt Macy 	if ((epoch->e_flags & EPOCH_LOCKED) == 0)
71874333b3dSMatt Macy 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
71974333b3dSMatt Macy 		    "epoch_wait() can be long running");
720635c1884SGleb Smirnoff 	KASSERT(!in_epoch(epoch), ("epoch_wait_preempt() called in the middle "
72110b8cd7fSMatt Macy 	    "of an epoch section of the same epoch"));
72274333b3dSMatt Macy #endif
72306bf2a6aSMatt Macy 	DROP_GIANT();
724fedab1b4SKonstantin Belousov 	thread_lock(td);
72506bf2a6aSMatt Macy 
72606bf2a6aSMatt Macy 	old_cpu = PCPU_GET(cpuid);
72706bf2a6aSMatt Macy 	old_pinned = td->td_pinned;
72806bf2a6aSMatt Macy 	old_prio = td->td_priority;
72906bf2a6aSMatt Macy 	was_bound = sched_is_bound(td);
73006bf2a6aSMatt Macy 	sched_unbind(td);
73106bf2a6aSMatt Macy 	td->td_pinned = 0;
73206bf2a6aSMatt Macy 	sched_bind(td, old_cpu);
73306bf2a6aSMatt Macy 
734635c1884SGleb Smirnoff 	ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_block_handler_preempt,
735635c1884SGleb Smirnoff 	    NULL);
73606bf2a6aSMatt Macy 
73706bf2a6aSMatt Macy 	/* restore CPU binding, if any */
73806bf2a6aSMatt Macy 	if (was_bound != 0) {
73906bf2a6aSMatt Macy 		sched_bind(td, old_cpu);
74006bf2a6aSMatt Macy 	} else {
74106bf2a6aSMatt Macy 		/* get thread back to initial CPU, if any */
74206bf2a6aSMatt Macy 		if (old_pinned != 0)
74306bf2a6aSMatt Macy 			sched_bind(td, old_cpu);
74406bf2a6aSMatt Macy 		sched_unbind(td);
74506bf2a6aSMatt Macy 	}
74606bf2a6aSMatt Macy 	/* restore pinned after bind */
74706bf2a6aSMatt Macy 	td->td_pinned = old_pinned;
74806bf2a6aSMatt Macy 
74906bf2a6aSMatt Macy 	/* restore thread priority */
75006bf2a6aSMatt Macy 	sched_prio(td, old_prio);
75106bf2a6aSMatt Macy 	thread_unlock(td);
75206bf2a6aSMatt Macy 	PICKUP_GIANT();
7530c58f85bSMatt Macy 	KASSERT(td->td_locks == locks,
7540c58f85bSMatt Macy 	    ("%d residual locks held", td->td_locks - locks));
75506bf2a6aSMatt Macy }
75606bf2a6aSMatt Macy 
7575e68a3dfSMatt Macy static void
75870398c2fSMatt Macy epoch_block_handler(struct ck_epoch *g __unused, ck_epoch_record_t *c __unused,
7595e68a3dfSMatt Macy     void *arg __unused)
7605e68a3dfSMatt Macy {
7615e68a3dfSMatt Macy 	cpu_spinwait();
7625e68a3dfSMatt Macy }
7635e68a3dfSMatt Macy 
7645e68a3dfSMatt Macy void
76570398c2fSMatt Macy epoch_wait(epoch_t epoch)
7665e68a3dfSMatt Macy {
7675e68a3dfSMatt Macy 
76820ba6811SMatt Macy 	MPASS(cold || epoch != NULL);
76920ba6811SMatt Macy 	INIT_CHECK(epoch);
77070398c2fSMatt Macy 	MPASS(epoch->e_flags == 0);
7715e68a3dfSMatt Macy 	critical_enter();
77270398c2fSMatt Macy 	ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_block_handler, NULL);
7735e68a3dfSMatt Macy 	critical_exit();
7745e68a3dfSMatt Macy }
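
/*
 * Writer-side sketch (illustrative only; "foo_lock", "foo_list", "p",
 * "f_link" and M_FOO are hypothetical): after unlinking an object from
 * an epoch-protected structure, wait for a grace period before freeing
 * it.  epoch_wait_preempt() is the variant for preemptible epochs;
 * epoch_wait() above is its non-preemptible counterpart.
 *
 *	mtx_lock(&foo_lock);
 *	CK_LIST_REMOVE(p, f_link);
 *	mtx_unlock(&foo_lock);
 *	epoch_wait_preempt(foo_epoch);
 *	free(p, M_FOO);
 */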
7755e68a3dfSMatt Macy 
77606bf2a6aSMatt Macy void
77766c6c556SGleb Smirnoff epoch_call(epoch_t epoch, epoch_callback_t callback, epoch_context_t ctx)
77806bf2a6aSMatt Macy {
7796573d758SMatt Macy 	epoch_record_t er;
7802a45e828SMatt Macy 	ck_epoch_entry_t *cb;
78106bf2a6aSMatt Macy 
78206bf2a6aSMatt Macy 	cb = (void *)ctx;
783b2cb2896SMatt Macy 
7841f4beb63SMatt Macy 	MPASS(callback);
7851f4beb63SMatt Macy 	/* too early in boot to have epoch set up */
7862a45e828SMatt Macy 	if (__predict_false(epoch == NULL))
7872a45e828SMatt Macy 		goto boottime;
7880bcfb473SMatt Macy #if !defined(EARLY_AP_STARTUP)
7890bcfb473SMatt Macy 	if (__predict_false(inited < 2))
7900bcfb473SMatt Macy 		goto boottime;
7910bcfb473SMatt Macy #endif
7922a45e828SMatt Macy 
793b2cb2896SMatt Macy 	critical_enter();
794a5f10424SMatt Macy 	*DPCPU_PTR(epoch_cb_count) += 1;
795822e50e3SMatt Macy 	er = epoch_currecord(epoch);
79691cf4975SMatt Macy 	ck_epoch_call(&er->er_record, cb, (ck_epoch_cb_t *)callback);
797b2cb2896SMatt Macy 	critical_exit();
7982a45e828SMatt Macy 	return;
7992a45e828SMatt Macy boottime:
8002a45e828SMatt Macy 	callback(ctx);
80106bf2a6aSMatt Macy }
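
/*
 * Deferred-reclamation sketch (illustrative only; "struct foo", "f_ctx"
 * and M_FOO are hypothetical): embed a struct epoch_context in the
 * object and free it from the callback once a grace period has elapsed,
 * instead of blocking in epoch_wait().
 *
 *	struct foo {
 *		struct epoch_context f_ctx;
 *		...
 *	};
 *
 *	static void
 *	foo_free_cb(epoch_context_t ctx)
 *	{
 *		struct foo *p;
 *
 *		p = __containerof(ctx, struct foo, f_ctx);
 *		free(p, M_FOO);
 *	}
 *
 *	epoch_call(foo_epoch, foo_free_cb, &p->f_ctx);
 */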
80206bf2a6aSMatt Macy 
803c4d901e9SMatt Macy static void
804c4d901e9SMatt Macy epoch_call_task(void *arg __unused)
805c4d901e9SMatt Macy {
806c4d901e9SMatt Macy 	ck_stack_entry_t *cursor, *head, *next;
807c4d901e9SMatt Macy 	ck_epoch_record_t *record;
808822e50e3SMatt Macy 	epoch_record_t er;
809c4d901e9SMatt Macy 	epoch_t epoch;
810c4d901e9SMatt Macy 	ck_stack_t cb_stack;
811c4d901e9SMatt Macy 	int i, npending, total;
812c4d901e9SMatt Macy 
813c4d901e9SMatt Macy 	ck_stack_init(&cb_stack);
814c4d901e9SMatt Macy 	critical_enter();
81570398c2fSMatt Macy 	epoch_enter(global_epoch);
816826c0793SHans Petter Selasky 	for (total = i = 0; i != MAX_EPOCHS; i++) {
817826c0793SHans Petter Selasky 		epoch = epoch_array + i;
818826c0793SHans Petter Selasky 		if (__predict_false(
819826c0793SHans Petter Selasky 		    atomic_load_acq_int(&epoch->e_in_use) == 0))
820c4d901e9SMatt Macy 			continue;
821822e50e3SMatt Macy 		er = epoch_currecord(epoch);
82291cf4975SMatt Macy 		record = &er->er_record;
823c4d901e9SMatt Macy 		if ((npending = record->n_pending) == 0)
824c4d901e9SMatt Macy 			continue;
825c4d901e9SMatt Macy 		ck_epoch_poll_deferred(record, &cb_stack);
826c4d901e9SMatt Macy 		total += npending - record->n_pending;
827b2cb2896SMatt Macy 	}
82870398c2fSMatt Macy 	epoch_exit(global_epoch);
829a5f10424SMatt Macy 	*DPCPU_PTR(epoch_cb_count) -= total;
830c4d901e9SMatt Macy 	critical_exit();
831c4d901e9SMatt Macy 
8325e68a3dfSMatt Macy 	counter_u64_add(epoch_call_count, total);
8335e68a3dfSMatt Macy 	counter_u64_add(epoch_call_task_count, 1);
8345e68a3dfSMatt Macy 
835c4d901e9SMatt Macy 	head = ck_stack_batch_pop_npsc(&cb_stack);
836c4d901e9SMatt Macy 	for (cursor = head; cursor != NULL; cursor = next) {
8372a45e828SMatt Macy 		struct ck_epoch_entry *entry =
8382a45e828SMatt Macy 		    ck_epoch_entry_container(cursor);
839e445381fSMatt Macy 
840c4d901e9SMatt Macy 		next = CK_STACK_NEXT(cursor);
8412a45e828SMatt Macy 		entry->function(entry);
84206bf2a6aSMatt Macy 	}
843b2cb2896SMatt Macy }
84406bf2a6aSMatt Macy 
8457667824aSKyle Evans static int
8467667824aSKyle Evans in_epoch_verbose_preempt(epoch_t epoch, int dump_onfail)
84706bf2a6aSMatt Macy {
8487667824aSKyle Evans 	epoch_record_t er;
8499f360eecSGleb Smirnoff 	struct epoch_tracker *tdwait;
8506573d758SMatt Macy 	struct thread *td;
8516573d758SMatt Macy 
8527667824aSKyle Evans 	MPASS(epoch != NULL);
8537667824aSKyle Evans 	MPASS((epoch->e_flags & EPOCH_PREEMPT) != 0);
8546573d758SMatt Macy 	td = curthread;
8555757b59fSGleb Smirnoff 	if (THREAD_CAN_SLEEP())
8566573d758SMatt Macy 		return (0);
8576573d758SMatt Macy 	critical_enter();
858822e50e3SMatt Macy 	er = epoch_currecord(epoch);
8596573d758SMatt Macy 	TAILQ_FOREACH(tdwait, &er->er_tdlist, et_link)
8606573d758SMatt Macy 		if (tdwait->et_td == td) {
8616573d758SMatt Macy 			critical_exit();
8626573d758SMatt Macy 			return (1);
8636573d758SMatt Macy 		}
8646573d758SMatt Macy #ifdef INVARIANTS
8656573d758SMatt Macy 	if (dump_onfail) {
8666573d758SMatt Macy 		MPASS(td->td_pinned);
8676573d758SMatt Macy 		printf("cpu: %d id: %d\n", curcpu, td->td_tid);
8686573d758SMatt Macy 		TAILQ_FOREACH(tdwait, &er->er_tdlist, et_link)
8696573d758SMatt Macy 			printf("td_tid: %d ", tdwait->et_td->td_tid);
8706573d758SMatt Macy 		printf("\n");
8716573d758SMatt Macy 	}
8726573d758SMatt Macy #endif
8736573d758SMatt Macy 	critical_exit();
8746573d758SMatt Macy 	return (0);
8756573d758SMatt Macy }
8766573d758SMatt Macy 
8777667824aSKyle Evans #ifdef INVARIANTS
8787667824aSKyle Evans static void
8797667824aSKyle Evans epoch_assert_nocpu(epoch_t epoch, struct thread *td)
8807667824aSKyle Evans {
8817667824aSKyle Evans 	epoch_record_t er;
8827667824aSKyle Evans 	int cpu;
8837667824aSKyle Evans 	bool crit;
8847667824aSKyle Evans 
8857667824aSKyle Evans 	crit = td->td_critnest > 0;
8867667824aSKyle Evans 
8877667824aSKyle Evans 	/* Check for a critical section mishap. */
8887667824aSKyle Evans 	CPU_FOREACH(cpu) {
8897667824aSKyle Evans 		er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
8907667824aSKyle Evans 		KASSERT(er->er_td != td,
8917667824aSKyle Evans 		    ("%s critical section in epoch '%s', from cpu %d",
8927667824aSKyle Evans 		    (crit ? "exited" : "re-entered"), epoch->e_name, cpu));
8937667824aSKyle Evans 	}
8947667824aSKyle Evans }
8957667824aSKyle Evans #else
896f3316835SHans Petter Selasky #define	epoch_assert_nocpu(e, td) do {} while (0)
8977667824aSKyle Evans #endif
8987667824aSKyle Evans 
8997667824aSKyle Evans int
9007667824aSKyle Evans in_epoch_verbose(epoch_t epoch, int dump_onfail)
9017667824aSKyle Evans {
9027667824aSKyle Evans 	epoch_record_t er;
9037667824aSKyle Evans 	struct thread *td;
9047667824aSKyle Evans 
9057667824aSKyle Evans 	if (__predict_false((epoch) == NULL))
9067667824aSKyle Evans 		return (0);
9077667824aSKyle Evans 	if ((epoch->e_flags & EPOCH_PREEMPT) != 0)
9087667824aSKyle Evans 		return (in_epoch_verbose_preempt(epoch, dump_onfail));
9097667824aSKyle Evans 
9107667824aSKyle Evans 	/*
9117667824aSKyle Evans 	 * The thread being in a critical section is a necessary
9127667824aSKyle Evans 	 * Being in a critical section is a necessary condition for
9137667824aSKyle Evans 	 * correctly being inside a non-preemptible epoch, so a thread
9147667824aSKyle Evans 	 * that is not in one is definitely not in this epoch.
9157667824aSKyle Evans 	td = curthread;
9167667824aSKyle Evans 	if (td->td_critnest == 0) {
9177667824aSKyle Evans 		epoch_assert_nocpu(epoch, td);
9187667824aSKyle Evans 		return (0);
9197667824aSKyle Evans 	}
9207667824aSKyle Evans 
9217667824aSKyle Evans 	/*
9227667824aSKyle Evans 	 * The current cpu is in a critical section, so the epoch record will be
9237667824aSKyle Evans 	 * stable for the rest of this function.  Knowing that the record is not
9247667824aSKyle Evans 	 * active is sufficient for knowing whether we're in this epoch or not,
9257667824aSKyle Evans 	 * since it's a pcpu record.
9267667824aSKyle Evans 	 */
9277667824aSKyle Evans 	er = epoch_currecord(epoch);
9287667824aSKyle Evans 	if (er->er_record.active == 0) {
9297667824aSKyle Evans 		epoch_assert_nocpu(epoch, td);
9307667824aSKyle Evans 		return (0);
9317667824aSKyle Evans 	}
9327667824aSKyle Evans 
9337667824aSKyle Evans 	MPASS(er->er_td == td);
9347667824aSKyle Evans 	return (1);
9357667824aSKyle Evans }
9367667824aSKyle Evans 
9376573d758SMatt Macy int
9386573d758SMatt Macy in_epoch(epoch_t epoch)
9396573d758SMatt Macy {
9406573d758SMatt Macy 	return (in_epoch_verbose(epoch, 0));
9416573d758SMatt Macy }
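
/*
 * Sketch (illustrative only; "foo_epoch", "struct foo" and foo_lookup()
 * are hypothetical): in_epoch() is primarily useful for asserting that
 * the caller is inside the expected epoch section.
 *
 *	static struct foo *
 *	foo_lookup(uint32_t key)
 *	{
 *		MPASS(in_epoch(foo_epoch));
 *		...
 *	}
 */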
942b79aa45eSGleb Smirnoff 
943131b2b76SHans Petter Selasky static void
944131b2b76SHans Petter Selasky epoch_drain_cb(struct epoch_context *ctx)
945131b2b76SHans Petter Selasky {
946131b2b76SHans Petter Selasky 	struct epoch *epoch =
947131b2b76SHans Petter Selasky 	    __containerof(ctx, struct epoch_record, er_drain_ctx)->er_parent;
948131b2b76SHans Petter Selasky 
949131b2b76SHans Petter Selasky 	if (atomic_fetchadd_int(&epoch->e_drain_count, -1) == 1) {
950131b2b76SHans Petter Selasky 		mtx_lock(&epoch->e_drain_mtx);
951131b2b76SHans Petter Selasky 		wakeup(epoch);
952131b2b76SHans Petter Selasky 		mtx_unlock(&epoch->e_drain_mtx);
953131b2b76SHans Petter Selasky 	}
954131b2b76SHans Petter Selasky }
955131b2b76SHans Petter Selasky 
956131b2b76SHans Petter Selasky void
957131b2b76SHans Petter Selasky epoch_drain_callbacks(epoch_t epoch)
958131b2b76SHans Petter Selasky {
959131b2b76SHans Petter Selasky 	epoch_record_t er;
960131b2b76SHans Petter Selasky 	struct thread *td;
961131b2b76SHans Petter Selasky 	int was_bound;
962131b2b76SHans Petter Selasky 	int old_pinned;
963131b2b76SHans Petter Selasky 	int old_cpu;
964131b2b76SHans Petter Selasky 	int cpu;
965131b2b76SHans Petter Selasky 
966131b2b76SHans Petter Selasky 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
967131b2b76SHans Petter Selasky 	    "epoch_drain_callbacks() may sleep!");
968131b2b76SHans Petter Selasky 
969131b2b76SHans Petter Selasky 	/* too early in boot to have epoch set up */
970131b2b76SHans Petter Selasky 	if (__predict_false(epoch == NULL))
971131b2b76SHans Petter Selasky 		return;
972131b2b76SHans Petter Selasky #if !defined(EARLY_AP_STARTUP)
973131b2b76SHans Petter Selasky 	if (__predict_false(inited < 2))
974131b2b76SHans Petter Selasky 		return;
975131b2b76SHans Petter Selasky #endif
976131b2b76SHans Petter Selasky 	DROP_GIANT();
977131b2b76SHans Petter Selasky 
978131b2b76SHans Petter Selasky 	sx_xlock(&epoch->e_drain_sx);
979131b2b76SHans Petter Selasky 	mtx_lock(&epoch->e_drain_mtx);
980131b2b76SHans Petter Selasky 
981131b2b76SHans Petter Selasky 	td = curthread;
982131b2b76SHans Petter Selasky 	thread_lock(td);
983131b2b76SHans Petter Selasky 	old_cpu = PCPU_GET(cpuid);
984131b2b76SHans Petter Selasky 	old_pinned = td->td_pinned;
985131b2b76SHans Petter Selasky 	was_bound = sched_is_bound(td);
986131b2b76SHans Petter Selasky 	sched_unbind(td);
987131b2b76SHans Petter Selasky 	td->td_pinned = 0;
988131b2b76SHans Petter Selasky 
989131b2b76SHans Petter Selasky 	CPU_FOREACH(cpu)
990131b2b76SHans Petter Selasky 		epoch->e_drain_count++;
991131b2b76SHans Petter Selasky 	CPU_FOREACH(cpu) {
992131b2b76SHans Petter Selasky 		er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
993131b2b76SHans Petter Selasky 		sched_bind(td, cpu);
99466c6c556SGleb Smirnoff 		epoch_call(epoch, &epoch_drain_cb, &er->er_drain_ctx);
995131b2b76SHans Petter Selasky 	}
996131b2b76SHans Petter Selasky 
997131b2b76SHans Petter Selasky 	/* restore CPU binding, if any */
998131b2b76SHans Petter Selasky 	if (was_bound != 0) {
999131b2b76SHans Petter Selasky 		sched_bind(td, old_cpu);
1000131b2b76SHans Petter Selasky 	} else {
1001131b2b76SHans Petter Selasky 		/* get thread back to initial CPU, if any */
1002131b2b76SHans Petter Selasky 		if (old_pinned != 0)
1003131b2b76SHans Petter Selasky 			sched_bind(td, old_cpu);
1004131b2b76SHans Petter Selasky 		sched_unbind(td);
1005131b2b76SHans Petter Selasky 	}
1006131b2b76SHans Petter Selasky 	/* restore pinned after bind */
1007131b2b76SHans Petter Selasky 	td->td_pinned = old_pinned;
1008131b2b76SHans Petter Selasky 
1009131b2b76SHans Petter Selasky 	thread_unlock(td);
1010131b2b76SHans Petter Selasky 
1011131b2b76SHans Petter Selasky 	while (epoch->e_drain_count != 0)
1012131b2b76SHans Petter Selasky 		msleep(epoch, &epoch->e_drain_mtx, PZERO, "EDRAIN", 0);
1013131b2b76SHans Petter Selasky 
1014131b2b76SHans Petter Selasky 	mtx_unlock(&epoch->e_drain_mtx);
1015131b2b76SHans Petter Selasky 	sx_xunlock(&epoch->e_drain_sx);
1016131b2b76SHans Petter Selasky 
1017131b2b76SHans Petter Selasky 	PICKUP_GIANT();
1018131b2b76SHans Petter Selasky }
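
/*
 * Teardown sketch (illustrative only; "foo_epoch" and foo_uninit() are
 * hypothetical): a module drains outstanding callbacks before unloading
 * so that no deferred work can run after its code is unmapped;
 * epoch_free() performs the same drain before releasing the per-CPU
 * records.
 *
 *	static void
 *	foo_uninit(void)
 *	{
 *		epoch_drain_callbacks(foo_epoch);
 *		epoch_free(foo_epoch);
 *	}
 */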
1019