1 /* $NetBSD: kern_turnstile.c,v 1.55 2023/10/15 10:30:20 riastradh Exp $ */ 2 3 /*- 4 * Copyright (c) 2002, 2006, 2007, 2009, 2019, 2020, 2023 5 * The NetBSD Foundation, Inc. 6 * All rights reserved. 7 * 8 * This code is derived from software contributed to The NetBSD Foundation 9 * by Jason R. Thorpe and Andrew Doran. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 * POSSIBILITY OF SUCH DAMAGE. 31 */ 32 33 /* 34 * Turnstiles are described in detail in: 35 * 36 * Solaris Internals: Core Kernel Architecture, Jim Mauro and 37 * Richard McDougall. 38 * 39 * Turnstiles are kept in a hash table. There are likely to be many more 40 * synchronisation objects than there are threads. Since a thread can block 41 * on only one lock at a time, we only need one turnstile per thread, and 42 * so they are allocated at thread creation time. 43 * 44 * When a thread decides it needs to block on a lock, it looks up the 45 * active turnstile for that lock. If no active turnstile exists, then 46 * the process lends its turnstile to the lock. If there is already an 47 * active turnstile for the lock, the thread places its turnstile on a 48 * list of free turnstiles, and references the active one instead. 49 * 50 * The act of looking up the turnstile acquires an interlock on the sleep 51 * queue. If a thread decides it doesn't need to block after all, then this 52 * interlock must be released by explicitly aborting the turnstile 53 * operation. 54 * 55 * When a thread is awakened, it needs to get its turnstile back. If there 56 * are still other threads waiting in the active turnstile, the thread 57 * grabs a free turnstile off the free list. Otherwise, it can take back 58 * the active turnstile from the lock (thus deactivating the turnstile). 59 * 60 * Turnstiles are where we do priority inheritence. 61 */ 62 63 #include <sys/cdefs.h> 64 __KERNEL_RCSID(0, "$NetBSD: kern_turnstile.c,v 1.55 2023/10/15 10:30:20 riastradh Exp $"); 65 66 #include <sys/param.h> 67 68 #include <sys/lockdebug.h> 69 #include <sys/lwp.h> 70 #include <sys/proc.h> 71 #include <sys/sleepq.h> 72 #include <sys/sleeptab.h> 73 #include <sys/syncobj.h> 74 #include <sys/systm.h> 75 76 /* 77 * Shift of 6 aligns to typical cache line size of 64 bytes; there's no 78 * point having two turnstile locks to back two lock objects that share one 79 * cache line. 80 */ 81 #define TS_HASH_SIZE 128 82 #define TS_HASH_MASK (TS_HASH_SIZE - 1) 83 #define TS_HASH(obj) (((uintptr_t)(obj) >> 6) & TS_HASH_MASK) 84 85 static tschain_t turnstile_chains[TS_HASH_SIZE] __cacheline_aligned; 86 87 static union { 88 kmutex_t lock; 89 uint8_t pad[COHERENCY_UNIT]; 90 } turnstile_locks[TS_HASH_SIZE] __cacheline_aligned; 91 92 /* 93 * turnstile_init: 94 * 95 * Initialize the turnstile mechanism. 96 */ 97 void 98 turnstile_init(void) 99 { 100 int i; 101 102 for (i = 0; i < TS_HASH_SIZE; i++) { 103 LIST_INIT(&turnstile_chains[i]); 104 mutex_init(&turnstile_locks[i].lock, MUTEX_DEFAULT, IPL_SCHED); 105 } 106 107 turnstile_ctor(&turnstile0); 108 } 109 110 /* 111 * turnstile_ctor: 112 * 113 * Constructor for turnstiles. 114 */ 115 void 116 turnstile_ctor(turnstile_t *ts) 117 { 118 119 memset(ts, 0, sizeof(*ts)); 120 sleepq_init(&ts->ts_sleepq[TS_READER_Q]); 121 sleepq_init(&ts->ts_sleepq[TS_WRITER_Q]); 122 } 123 124 /* 125 * turnstile_remove: 126 * 127 * Remove an LWP from a turnstile sleep queue and wake it. 128 */ 129 static inline void 130 turnstile_remove(turnstile_t *ts, lwp_t *l, int q) 131 { 132 turnstile_t *nts; 133 134 KASSERT(l->l_ts == ts); 135 136 /* 137 * This process is no longer using the active turnstile. 138 * Find an inactive one on the free list to give to it. 139 */ 140 if ((nts = ts->ts_free) != NULL) { 141 KASSERT(TS_ALL_WAITERS(ts) > 1); 142 l->l_ts = nts; 143 ts->ts_free = nts->ts_free; 144 nts->ts_free = NULL; 145 } else { 146 /* 147 * If the free list is empty, this is the last 148 * waiter. 149 */ 150 KASSERT(TS_ALL_WAITERS(ts) == 1); 151 LIST_REMOVE(ts, ts_chain); 152 } 153 154 ts->ts_waiters[q]--; 155 sleepq_remove(&ts->ts_sleepq[q], l, true); 156 } 157 158 /* 159 * turnstile_lookup: 160 * 161 * Look up the turnstile for the specified lock. This acquires and 162 * holds the turnstile chain lock (sleep queue interlock). 163 */ 164 turnstile_t * 165 turnstile_lookup(wchan_t obj) 166 { 167 turnstile_t *ts; 168 tschain_t *tc; 169 u_int hash; 170 171 hash = TS_HASH(obj); 172 tc = &turnstile_chains[hash]; 173 mutex_spin_enter(&turnstile_locks[hash].lock); 174 175 LIST_FOREACH(ts, tc, ts_chain) 176 if (ts->ts_obj == obj) 177 return (ts); 178 179 /* 180 * No turnstile yet for this lock. No problem, turnstile_block() 181 * handles this by fetching the turnstile from the blocking thread. 182 */ 183 return (NULL); 184 } 185 186 /* 187 * turnstile_exit: 188 * 189 * Abort a turnstile operation. 190 */ 191 void 192 turnstile_exit(wchan_t obj) 193 { 194 195 mutex_spin_exit(&turnstile_locks[TS_HASH(obj)].lock); 196 } 197 198 /* 199 * turnstile_lendpri: 200 * 201 * Lend our priority to lwps on the blocking chain. 202 * 203 * If the current owner of the lock (l->l_wchan, set by sleepq_enqueue) 204 * has a priority lower than ours (lwp_eprio(l)), lend our priority to 205 * him to avoid priority inversions. 206 */ 207 208 static void 209 turnstile_lendpri(lwp_t *cur) 210 { 211 lwp_t * l = cur; 212 pri_t prio; 213 214 /* 215 * NOTE: if you get a panic in this code block, it is likely that 216 * a lock has been destroyed or corrupted while still in use. Try 217 * compiling a kernel with LOCKDEBUG to pinpoint the problem. 218 */ 219 220 LOCKDEBUG_BARRIER(l->l_mutex, 1); 221 KASSERT(l == curlwp); 222 prio = lwp_eprio(l); 223 for (;;) { 224 lwp_t *owner; 225 turnstile_t *ts; 226 bool dolock; 227 228 if (l->l_wchan == NULL) 229 break; 230 231 /* 232 * Ask syncobj the owner of the lock. 233 */ 234 owner = (*l->l_syncobj->sobj_owner)(l->l_wchan); 235 if (owner == NULL) 236 break; 237 238 /* 239 * The owner may have changed as we have dropped the tc lock. 240 */ 241 if (cur == owner) { 242 /* 243 * We own the lock: stop here, sleepq_block() 244 * should wake up immediately. 245 */ 246 break; 247 } 248 /* 249 * Acquire owner->l_mutex if we don't have it yet. 250 * Because we already have another LWP lock (l->l_mutex) held, 251 * we need to play a try lock dance to avoid deadlock. 252 */ 253 dolock = l->l_mutex != atomic_load_relaxed(&owner->l_mutex); 254 if (l == owner || (dolock && !lwp_trylock(owner))) { 255 /* 256 * The owner was changed behind us or trylock failed. 257 * Restart from curlwp. 258 * 259 * Note that there may be a livelock here: 260 * the owner may try grabbing cur's lock (which is the 261 * tc lock) while we're trying to grab the owner's lock. 262 */ 263 lwp_unlock(l); 264 l = cur; 265 lwp_lock(l); 266 prio = lwp_eprio(l); 267 continue; 268 } 269 /* 270 * If the owner's priority is already higher than ours, 271 * there's nothing to do anymore. 272 */ 273 if (prio <= lwp_eprio(owner)) { 274 if (dolock) 275 lwp_unlock(owner); 276 break; 277 } 278 /* 279 * Lend our priority to the 'owner' LWP. 280 * 281 * Update lenders info for turnstile_unlendpri. 282 */ 283 ts = l->l_ts; 284 KASSERT(ts->ts_inheritor == owner || ts->ts_inheritor == NULL); 285 if (ts->ts_inheritor == NULL) { 286 ts->ts_inheritor = owner; 287 ts->ts_eprio = prio; 288 SLIST_INSERT_HEAD(&owner->l_pi_lenders, ts, ts_pichain); 289 lwp_lendpri(owner, prio); 290 } else if (prio > ts->ts_eprio) { 291 ts->ts_eprio = prio; 292 lwp_lendpri(owner, prio); 293 } 294 if (dolock) 295 lwp_unlock(l); 296 LOCKDEBUG_BARRIER(owner->l_mutex, 1); 297 l = owner; 298 } 299 LOCKDEBUG_BARRIER(l->l_mutex, 1); 300 if (cur->l_mutex != atomic_load_relaxed(&l->l_mutex)) { 301 lwp_unlock(l); 302 lwp_lock(cur); 303 } 304 LOCKDEBUG_BARRIER(cur->l_mutex, 1); 305 } 306 307 /* 308 * turnstile_unlendpri: undo turnstile_lendpri 309 */ 310 311 static void 312 turnstile_unlendpri(turnstile_t *ts) 313 { 314 lwp_t * const l = curlwp; 315 turnstile_t *iter; 316 turnstile_t *next; 317 turnstile_t *prev = NULL; 318 pri_t prio; 319 bool dolock; 320 321 KASSERT(ts->ts_inheritor != NULL); 322 ts->ts_inheritor = NULL; 323 dolock = (atomic_load_relaxed(&l->l_mutex) == 324 l->l_cpu->ci_schedstate.spc_lwplock); 325 if (dolock) { 326 lwp_lock(l); 327 } 328 329 /* 330 * the following loop does two things. 331 * 332 * - remove ts from the list. 333 * 334 * - from the rest of the list, find the highest priority. 335 */ 336 337 prio = -1; 338 KASSERT(!SLIST_EMPTY(&l->l_pi_lenders)); 339 for (iter = SLIST_FIRST(&l->l_pi_lenders); 340 iter != NULL; iter = next) { 341 KASSERT(lwp_eprio(l) >= ts->ts_eprio); 342 next = SLIST_NEXT(iter, ts_pichain); 343 if (iter == ts) { 344 if (prev == NULL) { 345 SLIST_REMOVE_HEAD(&l->l_pi_lenders, 346 ts_pichain); 347 } else { 348 SLIST_REMOVE_AFTER(prev, ts_pichain); 349 } 350 } else if (prio < iter->ts_eprio) { 351 prio = iter->ts_eprio; 352 } 353 prev = iter; 354 } 355 356 lwp_lendpri(l, prio); 357 358 if (dolock) { 359 lwp_unlock(l); 360 } 361 } 362 363 /* 364 * turnstile_block: 365 * 366 * Enter an object into the turnstile chain and prepare the current 367 * LWP for sleep. 368 */ 369 void 370 turnstile_block(turnstile_t *ts, int q, wchan_t obj, syncobj_t *sobj) 371 { 372 lwp_t * const l = curlwp; /* cached curlwp */ 373 turnstile_t *ots; 374 tschain_t *tc; 375 kmutex_t *lock; 376 sleepq_t *sq; 377 u_int hash; 378 int nlocks; 379 380 hash = TS_HASH(obj); 381 tc = &turnstile_chains[hash]; 382 lock = &turnstile_locks[hash].lock; 383 384 KASSERT(q == TS_READER_Q || q == TS_WRITER_Q); 385 KASSERT(mutex_owned(lock)); 386 KASSERT(l != NULL); 387 KASSERT(l->l_ts != NULL); 388 389 if (ts == NULL) { 390 /* 391 * We are the first thread to wait for this object; 392 * lend our turnstile to it. 393 */ 394 ts = l->l_ts; 395 KASSERT(TS_ALL_WAITERS(ts) == 0); 396 KASSERT(LIST_EMPTY(&ts->ts_sleepq[TS_READER_Q])); 397 KASSERT(LIST_EMPTY(&ts->ts_sleepq[TS_WRITER_Q])); 398 ts->ts_obj = obj; 399 ts->ts_inheritor = NULL; 400 LIST_INSERT_HEAD(tc, ts, ts_chain); 401 } else { 402 /* 403 * Object already has a turnstile. Put our turnstile 404 * onto the free list, and reference the existing 405 * turnstile instead. 406 */ 407 ots = l->l_ts; 408 KASSERT(ots->ts_free == NULL); 409 ots->ts_free = ts->ts_free; 410 ts->ts_free = ots; 411 l->l_ts = ts; 412 413 KASSERT(ts->ts_obj == obj); 414 KASSERT(TS_ALL_WAITERS(ts) != 0); 415 KASSERT(!LIST_EMPTY(&ts->ts_sleepq[TS_READER_Q]) || 416 !LIST_EMPTY(&ts->ts_sleepq[TS_WRITER_Q])); 417 } 418 419 sq = &ts->ts_sleepq[q]; 420 ts->ts_waiters[q]++; 421 nlocks = sleepq_enter(sq, l, lock); 422 LOCKDEBUG_BARRIER(lock, 1); 423 sleepq_enqueue(sq, obj, sobj->sobj_name, sobj, false); 424 425 /* 426 * Disable preemption across this entire block, as we may drop 427 * scheduler locks (allowing preemption), and would prefer not 428 * to be interrupted while in a state of flux. 429 */ 430 KPREEMPT_DISABLE(l); 431 KASSERT(lock == l->l_mutex); 432 turnstile_lendpri(l); 433 sleepq_block(0, false, sobj, nlocks); 434 KPREEMPT_ENABLE(l); 435 } 436 437 /* 438 * turnstile_wakeup: 439 * 440 * Wake up the specified number of threads that are blocked 441 * in a turnstile. 442 */ 443 void 444 turnstile_wakeup(turnstile_t *ts, int q, int count, lwp_t *nl) 445 { 446 sleepq_t *sq; 447 kmutex_t *lock; 448 u_int hash; 449 lwp_t *l; 450 451 hash = TS_HASH(ts->ts_obj); 452 lock = &turnstile_locks[hash].lock; 453 sq = &ts->ts_sleepq[q]; 454 455 KASSERT(q == TS_READER_Q || q == TS_WRITER_Q); 456 KASSERT(count > 0); 457 KASSERT(count <= TS_WAITERS(ts, q)); 458 KASSERT(mutex_owned(lock)); 459 KASSERT(ts->ts_inheritor == curlwp || ts->ts_inheritor == NULL); 460 461 /* 462 * restore inherited priority if necessary. 463 */ 464 465 if (ts->ts_inheritor != NULL) { 466 turnstile_unlendpri(ts); 467 } 468 469 if (nl != NULL) { 470 #if defined(DEBUG) || defined(LOCKDEBUG) 471 LIST_FOREACH(l, sq, l_sleepchain) { 472 if (l == nl) 473 break; 474 } 475 if (l == NULL) 476 panic("turnstile_wakeup: nl not on sleepq"); 477 #endif 478 turnstile_remove(ts, nl, q); 479 } else { 480 while (count-- > 0) { 481 l = LIST_FIRST(sq); 482 KASSERT(l != NULL); 483 turnstile_remove(ts, l, q); 484 } 485 } 486 mutex_spin_exit(lock); 487 } 488 489 /* 490 * turnstile_unsleep: 491 * 492 * Remove an LWP from the turnstile. This is called when the LWP has 493 * not been awoken normally but instead interrupted: for example, if it 494 * has received a signal. It's not a valid action for turnstiles, 495 * since LWPs blocking on a turnstile are not interruptable. 496 */ 497 void 498 turnstile_unsleep(lwp_t *l, bool cleanup) 499 { 500 501 lwp_unlock(l); 502 panic("turnstile_unsleep"); 503 } 504 505 /* 506 * turnstile_changepri: 507 * 508 * Adjust the priority of an LWP residing on a turnstile. 509 */ 510 void 511 turnstile_changepri(lwp_t *l, pri_t pri) 512 { 513 514 /* XXX priority inheritance */ 515 sleepq_changepri(l, pri); 516 } 517 518 #if defined(LOCKDEBUG) 519 /* 520 * turnstile_print: 521 * 522 * Given the address of a lock object, print the contents of a 523 * turnstile. 524 */ 525 void 526 turnstile_print(volatile void *obj, void (*pr)(const char *, ...)) 527 { 528 turnstile_t *ts; 529 tschain_t *tc; 530 sleepq_t *rsq, *wsq; 531 u_int hash; 532 lwp_t *l; 533 534 hash = TS_HASH(obj); 535 tc = &turnstile_chains[hash]; 536 537 LIST_FOREACH(ts, tc, ts_chain) 538 if (ts->ts_obj == obj) 539 break; 540 541 if (ts == NULL) { 542 (*pr)("Turnstile: no active turnstile for this lock.\n"); 543 return; 544 } 545 546 rsq = &ts->ts_sleepq[TS_READER_Q]; 547 wsq = &ts->ts_sleepq[TS_WRITER_Q]; 548 549 (*pr)("Turnstile:\n"); 550 (*pr)("=> %d waiting readers:", TS_WAITERS(ts, TS_READER_Q)); 551 LIST_FOREACH(l, rsq, l_sleepchain) { 552 (*pr)(" %p", l); 553 } 554 (*pr)("\n"); 555 556 (*pr)("=> %d waiting writers:", TS_WAITERS(ts, TS_WRITER_Q)); 557 LIST_FOREACH(l, wsq, l_sleepchain) { 558 (*pr)(" %p", l); 559 } 560 (*pr)("\n"); 561 } 562 #endif /* LOCKDEBUG */ 563