/*	$NetBSD: kern_turnstile.c,v 1.38 2020/03/26 22:43:19 ad Exp $	*/

/*-
 * Copyright (c) 2002, 2006, 2007, 2009, 2019, 2020
 *     The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe and Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Turnstiles are described in detail in:
 *
 *	Solaris Internals: Core Kernel Architecture, Jim Mauro and
 *	    Richard McDougall.
 *
 * Turnstiles are kept in a hash table.  There are likely to be many more
 * synchronisation objects than there are threads.  Since a thread can block
 * on only one lock at a time, we only need one turnstile per thread, and
 * so they are allocated at thread creation time.
 *
 * When a thread decides it needs to block on a lock, it looks up the
 * active turnstile for that lock.  If no active turnstile exists, then
 * the thread lends its turnstile to the lock.  If there is already an
 * active turnstile for the lock, the thread places its turnstile on a
 * list of free turnstiles, and references the active one instead.
 *
 * The act of looking up the turnstile acquires an interlock on the sleep
 * queue.  If a thread decides it doesn't need to block after all, then this
 * interlock must be released by explicitly aborting the turnstile
 * operation.
 *
 * When a thread is awakened, it needs to get its turnstile back.  If there
 * are still other threads waiting in the active turnstile, the thread
 * grabs a free turnstile off the free list.  Otherwise, it can take back
 * the active turnstile from the lock (thus deactivating the turnstile).
 *
 * Turnstiles are where we do priority inheritance.
 */
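
/*
 * As a rough sketch of how a caller might drive this interface (the
 * real users live in kern_mutex.c and kern_rwlock.c; lock_is_free() is
 * a hypothetical re-check, not a real function):
 *
 *	ts = turnstile_lookup(lock);	takes the chain (sleep queue) lock
 *	if (lock_is_free(lock))
 *		turnstile_exit(lock);	not blocking: abort, drop interlock
 *	else
 *		turnstile_block(ts, TS_WRITER_Q, lock, &syncobj);
 *
 * and when releasing a contended lock:
 *
 *	if ((ts = turnstile_lookup(lock)) != NULL)
 *		turnstile_wakeup(ts, TS_WRITER_Q,
 *		    TS_WAITERS(ts, TS_WRITER_Q), NULL);	drops chain lock
 *	else
 *		turnstile_exit(lock);
 */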

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_turnstile.c,v 1.38 2020/03/26 22:43:19 ad Exp $");

#include <sys/param.h>
#include <sys/lockdebug.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/sleepq.h>
#include <sys/systm.h>

/*
 * Shift of 6 aligns to the typical cache line size of 64 bytes; there's
 * no point having two turnstile locks to back two lock objects that
 * share one cache line.
 */
#define	TS_HASH_SIZE	128
#define	TS_HASH_MASK	(TS_HASH_SIZE - 1)
#define	TS_HASH(obj)	(((uintptr_t)(obj) >> 6) & TS_HASH_MASK)
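
/*
 * For example (with the 64-byte line assumed above), lock objects at
 * addresses 0x1000 and 0x1020 share a cache line, and both hash to
 * bucket (0x1000 >> 6) & TS_HASH_MASK == (0x1020 >> 6) & TS_HASH_MASK
 * == 0x40, so they are also backed by the same turnstile lock.
 */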

static tschain_t	turnstile_chains[TS_HASH_SIZE] __cacheline_aligned;
pool_cache_t		turnstile_cache __read_mostly;
extern turnstile_t	turnstile0;

static union {
	kmutex_t	lock;
	uint8_t		pad[COHERENCY_UNIT];
} turnstile_locks[TS_HASH_SIZE] __cacheline_aligned;

static int		turnstile_ctor(void *, void *, int);

/*
 * turnstile_init:
 *
 *	Initialize the turnstile mechanism.
 */
void
turnstile_init(void)
{
	int i;

	for (i = 0; i < TS_HASH_SIZE; i++) {
		LIST_INIT(&turnstile_chains[i]);
		mutex_init(&turnstile_locks[i].lock, MUTEX_DEFAULT, IPL_SCHED);
	}

	turnstile_cache = pool_cache_init(sizeof(turnstile_t), coherency_unit,
	    0, 0, "tstile", NULL, IPL_NONE, turnstile_ctor, NULL, NULL);
	KASSERT(turnstile_cache != NULL);

	(void)turnstile_ctor(NULL, &turnstile0, 0);
}

/*
 * turnstile_ctor:
 *
 *	Constructor for turnstiles.
 */
static int
turnstile_ctor(void *arg, void *obj, int flags)
{
	turnstile_t *ts = obj;

	memset(ts, 0, sizeof(*ts));
	sleepq_init(&ts->ts_sleepq[TS_READER_Q]);
	sleepq_init(&ts->ts_sleepq[TS_WRITER_Q]);
	return (0);
}

/*
 * turnstile_remove:
 *
 *	Remove an LWP from a turnstile sleep queue and wake it.
 */
static inline void
turnstile_remove(turnstile_t *ts, lwp_t *l, int q)
{
	turnstile_t *nts;

	KASSERT(l->l_ts == ts);

	/*
	 * This thread is no longer using the active turnstile.
	 * Find an inactive one on the free list to give to it.
	 */
	if ((nts = ts->ts_free) != NULL) {
		KASSERT(TS_ALL_WAITERS(ts) > 1);
		l->l_ts = nts;
		ts->ts_free = nts->ts_free;
		nts->ts_free = NULL;
	} else {
		/*
		 * If the free list is empty, this is the last
		 * waiter.
		 */
		KASSERT(TS_ALL_WAITERS(ts) == 1);
		LIST_REMOVE(ts, ts_chain);
	}

	ts->ts_waiters[q]--;
	sleepq_remove(&ts->ts_sleepq[q], l);
}
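
/*
 * To illustrate the free list: with three waiters, the active turnstile
 * carries two donated turnstiles chained on ts_free.  The first two
 * wakeups each hand one back from the free list; the last waiter
 * reclaims the active turnstile itself and unhooks it from the hash
 * chain, deactivating it.
 */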

/*
 * turnstile_lookup:
 *
 *	Look up the turnstile for the specified lock.  This acquires and
 *	holds the turnstile chain lock (sleep queue interlock).
 */
turnstile_t *
turnstile_lookup(wchan_t obj)
{
	turnstile_t *ts;
	tschain_t *tc;
	u_int hash;

	hash = TS_HASH(obj);
	tc = &turnstile_chains[hash];
	mutex_spin_enter(&turnstile_locks[hash].lock);

	LIST_FOREACH(ts, tc, ts_chain)
		if (ts->ts_obj == obj)
			return (ts);

	/*
	 * No turnstile yet for this lock.  No problem, turnstile_block()
	 * handles this by fetching the turnstile from the blocking thread.
	 */
	return (NULL);
}

/*
 * turnstile_exit:
 *
 *	Abort a turnstile operation.
 */
void
turnstile_exit(wchan_t obj)
{

	mutex_spin_exit(&turnstile_locks[TS_HASH(obj)].lock);
}

/*
 * turnstile_lendpri:
 *
 *	Lend our priority to LWPs on the blocking chain.
 *
 *	If the current owner of the lock (l->l_wchan, set by sleepq_enqueue)
 *	has a priority lower than ours (lwp_eprio(l)), lend our priority to
 *	it to avoid priority inversion.
 */

static void
turnstile_lendpri(lwp_t *cur)
{
	lwp_t *l = cur;
	pri_t prio;

	/*
	 * NOTE: if you get a panic in this code block, it is likely that
	 * a lock has been destroyed or corrupted while still in use.  Try
	 * compiling a kernel with LOCKDEBUG to pinpoint the problem.
	 */

	LOCKDEBUG_BARRIER(l->l_mutex, 1);
	KASSERT(l == curlwp);
	prio = lwp_eprio(l);
	for (;;) {
		lwp_t *owner;
		turnstile_t *ts;
		bool dolock;

		if (l->l_wchan == NULL)
			break;

		/*
		 * Ask the syncobj for the owner of the lock.
		 */
		owner = (*l->l_syncobj->sobj_owner)(l->l_wchan);
		if (owner == NULL)
			break;

		/*
		 * The owner may have changed as we have dropped the tc lock.
		 */
		if (cur == owner) {
			/*
			 * We own the lock: stop here, sleepq_block()
			 * should wake up immediately.
			 */
			break;
		}
		/*
		 * Acquire owner->l_mutex if we don't have it yet.
		 * Because we already have another LWP lock (l->l_mutex) held,
		 * we need to play a try lock dance to avoid deadlock.
		 */
		dolock = l->l_mutex != owner->l_mutex;
		if (l == owner || (dolock && !lwp_trylock(owner))) {
			/*
			 * The owner was changed behind us or trylock failed.
			 * Restart from curlwp.
			 *
			 * Note that there may be a livelock here:
			 * the owner may try grabbing cur's lock (which is
			 * the tc lock) while we're trying to grab the
			 * owner's lock.
			 */
			lwp_unlock(l);
			l = cur;
			lwp_lock(l);
			prio = lwp_eprio(l);
			continue;
		}
		/*
		 * If the owner's priority is already higher than ours,
		 * there's nothing to do anymore.
		 */
		if (prio <= lwp_eprio(owner)) {
			if (dolock)
				lwp_unlock(owner);
			break;
		}
		/*
		 * Lend our priority to the 'owner' LWP.
		 *
		 * Record the lending info for turnstile_unlendpri().
		 */
		ts = l->l_ts;
		KASSERT(ts->ts_inheritor == owner || ts->ts_inheritor == NULL);
		if (ts->ts_inheritor == NULL) {
			ts->ts_inheritor = owner;
			ts->ts_eprio = prio;
			SLIST_INSERT_HEAD(&owner->l_pi_lenders, ts, ts_pichain);
			lwp_lendpri(owner, prio);
		} else if (prio > ts->ts_eprio) {
			ts->ts_eprio = prio;
			lwp_lendpri(owner, prio);
		}
		if (dolock)
			lwp_unlock(l);
		LOCKDEBUG_BARRIER(owner->l_mutex, 1);
		l = owner;
	}
	LOCKDEBUG_BARRIER(l->l_mutex, 1);
	if (cur->l_mutex != l->l_mutex) {
		lwp_unlock(l);
		lwp_lock(cur);
	}
	LOCKDEBUG_BARRIER(cur->l_mutex, 1);
}
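
/*
 * Worked example of the chain walk above (assuming larger pri_t values
 * mean higher priority, per the prio <= lwp_eprio() test): LWP a (eprio
 * 50) blocks on a lock owned by b (eprio 30), while b is itself blocked
 * on a lock owned by c (eprio 20).  The loop lends 50 to b through the
 * first lock's turnstile, then follows b->l_wchan to the second lock
 * and lends 50 to c, so the entire blocking chain runs at the lender's
 * priority.
 */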

/*
 * turnstile_unlendpri: undo turnstile_lendpri
 */

static void
turnstile_unlendpri(turnstile_t *ts)
{
	lwp_t * const l = curlwp;
	turnstile_t *iter;
	turnstile_t *next;
	turnstile_t *prev = NULL;
	pri_t prio;
	bool dolock;

	KASSERT(ts->ts_inheritor != NULL);
	ts->ts_inheritor = NULL;
	dolock = l->l_mutex == l->l_cpu->ci_schedstate.spc_lwplock;
	if (dolock) {
		lwp_lock(l);
	}

	/*
	 * The following loop does two things:
	 *
	 * - removes ts from the list.
	 *
	 * - finds the highest priority among the rest of the list.
	 */

	prio = -1;
	KASSERT(!SLIST_EMPTY(&l->l_pi_lenders));
	for (iter = SLIST_FIRST(&l->l_pi_lenders);
	    iter != NULL; iter = next) {
		KASSERT(lwp_eprio(l) >= ts->ts_eprio);
		next = SLIST_NEXT(iter, ts_pichain);
		if (iter == ts) {
			if (prev == NULL) {
				SLIST_REMOVE_HEAD(&l->l_pi_lenders,
				    ts_pichain);
			} else {
				SLIST_REMOVE_AFTER(prev, ts_pichain);
			}
		} else if (prio < iter->ts_eprio) {
			prio = iter->ts_eprio;
		}
		prev = iter;
	}

	lwp_lendpri(l, prio);

	if (dolock) {
		lwp_unlock(l);
	}
}
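
/*
 * For example, if curlwp is the inheritor of three turnstiles with
 * ts_eprio values 40, 60 and 50, dropping the lock behind the 60 one
 * removes that turnstile from l_pi_lenders and recomputes the lent
 * priority as max(40, 50) = 50.  Once the list is empty, prio remains
 * the initial -1, which lwp_lendpri() treats as "no lent priority",
 * restoring the LWP's own priority.
 */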

/*
 * turnstile_block:
 *
 *	Enter an object into the turnstile chain and prepare the current
 *	LWP for sleep.
 */
void
turnstile_block(turnstile_t *ts, int q, wchan_t obj, syncobj_t *sobj)
{
	lwp_t * const l = curlwp; /* cached curlwp */
	turnstile_t *ots;
	tschain_t *tc;
	kmutex_t *lock;
	sleepq_t *sq;
	pri_t obase;
	u_int hash;

	hash = TS_HASH(obj);
	tc = &turnstile_chains[hash];
	lock = &turnstile_locks[hash].lock;

	KASSERT(q == TS_READER_Q || q == TS_WRITER_Q);
	KASSERT(mutex_owned(lock));
	KASSERT(l != NULL && l->l_ts != NULL);

	if (ts == NULL) {
		/*
		 * We are the first thread to wait for this object;
		 * lend our turnstile to it.
		 */
		ts = l->l_ts;
		KASSERT(TS_ALL_WAITERS(ts) == 0);
		KASSERT(LIST_EMPTY(&ts->ts_sleepq[TS_READER_Q]) &&
			LIST_EMPTY(&ts->ts_sleepq[TS_WRITER_Q]));
		ts->ts_obj = obj;
		ts->ts_inheritor = NULL;
		LIST_INSERT_HEAD(tc, ts, ts_chain);
	} else {
		/*
		 * Object already has a turnstile.  Put our turnstile
		 * onto the free list, and reference the existing
		 * turnstile instead.
		 */
		ots = l->l_ts;
		KASSERT(ots->ts_free == NULL);
		ots->ts_free = ts->ts_free;
		ts->ts_free = ots;
		l->l_ts = ts;

		KASSERT(ts->ts_obj == obj);
		KASSERT(TS_ALL_WAITERS(ts) != 0);
		KASSERT(!LIST_EMPTY(&ts->ts_sleepq[TS_READER_Q]) ||
			!LIST_EMPTY(&ts->ts_sleepq[TS_WRITER_Q]));
	}

	sq = &ts->ts_sleepq[q];
	ts->ts_waiters[q]++;
	sleepq_enter(sq, l, lock);
	LOCKDEBUG_BARRIER(lock, 1);
	l->l_kpriority = true;
	obase = l->l_kpribase;
	if (obase < PRI_KTHREAD)
		l->l_kpribase = PRI_KTHREAD;
	sleepq_enqueue(sq, obj, "tstile", sobj);

	/*
	 * Disable preemption across this entire block, as we may drop
	 * scheduler locks (allowing preemption), and would prefer not
	 * to be interrupted while in a state of flux.
	 */
	KPREEMPT_DISABLE(l);
	KASSERT(lock == l->l_mutex);
	turnstile_lendpri(l);
	sleepq_block(0, false);
	l->l_kpribase = obase;
	KPREEMPT_ENABLE(l);
}

/*
 * turnstile_wakeup:
 *
 *	Wake up the specified number of threads that are blocked
 *	in a turnstile.
 */
void
turnstile_wakeup(turnstile_t *ts, int q, int count, lwp_t *nl)
{
	sleepq_t *sq;
	kmutex_t *lock;
	u_int hash;
	lwp_t *l;

	hash = TS_HASH(ts->ts_obj);
	lock = &turnstile_locks[hash].lock;
	sq = &ts->ts_sleepq[q];

	KASSERT(q == TS_READER_Q || q == TS_WRITER_Q);
	KASSERT(count > 0 && count <= TS_WAITERS(ts, q));
	KASSERT(mutex_owned(lock));
	KASSERT(ts->ts_inheritor == curlwp || ts->ts_inheritor == NULL);

	/*
	 * Restore inherited priority if necessary.
	 */

	if (ts->ts_inheritor != NULL) {
		turnstile_unlendpri(ts);
	}

	if (nl != NULL) {
#if defined(DEBUG) || defined(LOCKDEBUG)
		LIST_FOREACH(l, sq, l_sleepchain) {
			if (l == nl)
				break;
		}
		if (l == NULL)
			panic("turnstile_wakeup: nl not on sleepq");
#endif
		turnstile_remove(ts, nl, q);
	} else {
		while (count-- > 0) {
			l = LIST_FIRST(sq);
			KASSERT(l != NULL);
			turnstile_remove(ts, l, q);
		}
	}
	mutex_spin_exit(lock);
}

/*
 * turnstile_unsleep:
 *
 *	Remove an LWP from the turnstile.  This is called when the LWP has
 *	not been awoken normally but instead interrupted: for example, if it
 *	has received a signal.  It's not a valid action for turnstiles,
 *	since LWPs blocking on a turnstile are not interruptible.
 */
void
turnstile_unsleep(lwp_t *l, bool cleanup)
{

	lwp_unlock(l);
	panic("turnstile_unsleep");
}

/*
 * turnstile_changepri:
 *
 *	Adjust the priority of an LWP residing on a turnstile.
 */
void
turnstile_changepri(lwp_t *l, pri_t pri)
{

	/* XXX priority inheritance */
	sleepq_changepri(l, pri);
}

#if defined(LOCKDEBUG)
/*
 * turnstile_print:
 *
 *	Given the address of a lock object, print the contents of a
 *	turnstile.
 */
void
turnstile_print(volatile void *obj, void (*pr)(const char *, ...))
{
	turnstile_t *ts;
	tschain_t *tc;
	sleepq_t *rsq, *wsq;
	u_int hash;
	lwp_t *l;

	hash = TS_HASH(obj);
	tc = &turnstile_chains[hash];

	LIST_FOREACH(ts, tc, ts_chain)
		if (ts->ts_obj == obj)
			break;

	if (ts == NULL) {
		(*pr)("Turnstile: no active turnstile for this lock.\n");
		return;
	}

	rsq = &ts->ts_sleepq[TS_READER_Q];
	wsq = &ts->ts_sleepq[TS_WRITER_Q];

	(*pr)("Turnstile:\n");
	(*pr)("=> %d waiting readers:", TS_WAITERS(ts, TS_READER_Q));
	LIST_FOREACH(l, rsq, l_sleepchain) {
		(*pr)(" %p", l);
	}
	(*pr)("\n");

	(*pr)("=> %d waiting writers:", TS_WAITERS(ts, TS_WRITER_Q));
	LIST_FOREACH(l, wsq, l_sleepchain) {
		(*pr)(" %p", l);
	}
	(*pr)("\n");
}
#endif	/* LOCKDEBUG */