/*	$NetBSD: kern_turnstile.c,v 1.45 2022/10/26 23:27:16 riastradh Exp $	*/

/*-
 * Copyright (c) 2002, 2006, 2007, 2009, 2019, 2020
 *     The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe and Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Turnstiles are described in detail in:
 *
 *	Solaris Internals: Core Kernel Architecture, Jim Mauro and
 *	    Richard McDougall.
 *
 * Turnstiles are kept in a hash table.  There are likely to be many more
 * synchronisation objects than there are threads.  Since a thread can block
 * on only one lock at a time, we only need one turnstile per thread, and
 * so they are allocated at thread creation time.
 *
 * When a thread decides it needs to block on a lock, it looks up the
 * active turnstile for that lock.  If no active turnstile exists, then
 * the thread lends its turnstile to the lock.  If there is already an
 * active turnstile for the lock, the thread places its turnstile on a
 * list of free turnstiles, and references the active one instead.
 *
 * The act of looking up the turnstile acquires an interlock on the sleep
 * queue.  If a thread decides it doesn't need to block after all, then this
 * interlock must be released by explicitly aborting the turnstile
 * operation.
 *
 * When a thread is awakened, it needs to get its turnstile back.  If there
 * are still other threads waiting in the active turnstile, the thread
 * grabs a free turnstile off the free list.  Otherwise, it can take back
 * the active turnstile from the lock (thus deactivating the turnstile).
 *
 * Turnstiles are where we do priority inheritance.
 */
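
/*
 * A minimal sketch of the acquire-side caller protocol described above.
 * The `lock' object, `lock_syncobj', and lock_became_free() below are
 * hypothetical stand-ins for a real lock type's availability check; the
 * real users are the mutex and rwlock slow paths.  turnstile_lookup()
 * returns with the chain lock (sleep queue interlock) held, so every
 * path out of it must either block or explicitly abort:
 *
 *	turnstile_t *ts;
 *
 *	ts = turnstile_lookup(lock);		// interlock now held
 *	if (lock_became_free(lock)) {
 *		turnstile_exit(lock);		// abort: drop the interlock
 *	} else {
 *		// Sleeps, lends priority, and drops the interlock.
 *		turnstile_block(ts, TS_WRITER_Q, lock, &lock_syncobj);
 *	}
 */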

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_turnstile.c,v 1.45 2022/10/26 23:27:16 riastradh Exp $");

#include <sys/param.h>
#include <sys/lockdebug.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/sleepq.h>
#include <sys/sleeptab.h>
#include <sys/systm.h>

/*
 * Shift of 6 aligns to typical cache line size of 64 bytes;  there's no
 * point having two turnstile locks to back two lock objects that share one
 * cache line.
 */
#define	TS_HASH_SIZE	128
#define	TS_HASH_MASK	(TS_HASH_SIZE - 1)
#define	TS_HASH(obj)	(((uintptr_t)(obj) >> 6) & TS_HASH_MASK)
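
/*
 * Worked example of TS_HASH (the addresses are illustrative only): the
 * hash depends only on bits 6..12 of the object address, so two lock
 * objects inside the same 64-byte cache line always land in the same
 * bucket, and therefore share one turnstile lock:
 *
 *	TS_HASH((wchan_t)0x421000) == ((0x421000 >> 6) & 127) == 64
 *	TS_HASH((wchan_t)0x421020) == ((0x421020 >> 6) & 127) == 64
 */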

static tschain_t	turnstile_chains[TS_HASH_SIZE] __cacheline_aligned;
struct pool		turnstile_pool;

static union {
	kmutex_t	lock;
	uint8_t		pad[COHERENCY_UNIT];
} turnstile_locks[TS_HASH_SIZE] __cacheline_aligned;

/*
 * turnstile_init:
 *
 *	Initialize the turnstile mechanism.
 */
void
turnstile_init(void)
{
	int i;

	for (i = 0; i < TS_HASH_SIZE; i++) {
		LIST_INIT(&turnstile_chains[i]);
		mutex_init(&turnstile_locks[i].lock, MUTEX_DEFAULT, IPL_SCHED);
	}

	pool_init(&turnstile_pool, sizeof(turnstile_t), coherency_unit,
	    0, 0, "tstile", NULL, IPL_NONE);

	turnstile_ctor(&turnstile0);
}

/*
 * turnstile_ctor:
 *
 *	Constructor for turnstiles.
 */
void
turnstile_ctor(turnstile_t *ts)
{

	memset(ts, 0, sizeof(*ts));
	sleepq_init(&ts->ts_sleepq[TS_READER_Q]);
	sleepq_init(&ts->ts_sleepq[TS_WRITER_Q]);
}

/*
 * turnstile_remove:
 *
 *	Remove an LWP from a turnstile sleep queue and wake it.
 */
static inline void
turnstile_remove(turnstile_t *ts, lwp_t *l, int q)
{
	turnstile_t *nts;

	KASSERT(l->l_ts == ts);

	/*
	 * This LWP is no longer using the active turnstile.
	 * Find an inactive one on the free list to give to it.
	 */
	if ((nts = ts->ts_free) != NULL) {
		KASSERT(TS_ALL_WAITERS(ts) > 1);
		l->l_ts = nts;
		ts->ts_free = nts->ts_free;
		nts->ts_free = NULL;
	} else {
		/*
		 * If the free list is empty, this is the last
		 * waiter.
		 */
		KASSERT(TS_ALL_WAITERS(ts) == 1);
		LIST_REMOVE(ts, ts_chain);
	}

	ts->ts_waiters[q]--;
	sleepq_remove(&ts->ts_sleepq[q], l);
}

/*
 * turnstile_lookup:
 *
 *	Look up the turnstile for the specified lock.  This acquires and
 *	holds the turnstile chain lock (sleep queue interlock).
 */
turnstile_t *
turnstile_lookup(wchan_t obj)
{
	turnstile_t *ts;
	tschain_t *tc;
	u_int hash;

	hash = TS_HASH(obj);
	tc = &turnstile_chains[hash];
	mutex_spin_enter(&turnstile_locks[hash].lock);

	LIST_FOREACH(ts, tc, ts_chain)
		if (ts->ts_obj == obj)
			return (ts);

	/*
	 * No turnstile yet for this lock.  No problem, turnstile_block()
	 * handles this by fetching the turnstile from the blocking thread.
	 */
	return (NULL);
}

/*
 * turnstile_exit:
 *
 *	Abort a turnstile operation.
 */
void
turnstile_exit(wchan_t obj)
{

	mutex_spin_exit(&turnstile_locks[TS_HASH(obj)].lock);
}

/*
 * turnstile_lendpri:
 *
 *	Lend our priority to LWPs on the blocking chain.
 *
 *	If the current owner of the lock (l->l_wchan, set by sleepq_enqueue)
 *	has a priority lower than ours (lwp_eprio(l)), lend our priority to
 *	it to avoid priority inversions.
 */

static void
turnstile_lendpri(lwp_t *cur)
{
	lwp_t * l = cur;
	pri_t prio;

	/*
	 * NOTE: if you get a panic in this code block, it is likely that
	 * a lock has been destroyed or corrupted while still in use.  Try
	 * compiling a kernel with LOCKDEBUG to pinpoint the problem.
	 */

	LOCKDEBUG_BARRIER(l->l_mutex, 1);
	KASSERT(l == curlwp);
	prio = lwp_eprio(l);
	for (;;) {
		lwp_t *owner;
		turnstile_t *ts;
		bool dolock;

		if (l->l_wchan == NULL)
			break;

		/*
		 * Ask the syncobj for the owner of the lock.
		 */
		owner = (*l->l_syncobj->sobj_owner)(l->l_wchan);
		if (owner == NULL)
			break;

		/*
		 * The owner may have changed as we have dropped the tc lock.
		 */
		if (cur == owner) {
			/*
			 * We own the lock: stop here, sleepq_block()
			 * should wake up immediately.
			 */
			break;
		}
		/*
		 * Acquire owner->l_mutex if we don't have it yet.
		 * Because we already have another LWP lock (l->l_mutex) held,
		 * we need to play a try-lock dance to avoid deadlock.
		 */
		dolock = l->l_mutex != atomic_load_relaxed(&owner->l_mutex);
		if (l == owner || (dolock && !lwp_trylock(owner))) {
			/*
			 * The owner was changed behind us or trylock failed.
			 * Restart from curlwp.
			 *
			 * Note that there may be a livelock here:
			 * the owner may try grabbing cur's lock (which is the
			 * tc lock) while we're trying to grab the owner's lock.
			 */
			lwp_unlock(l);
			l = cur;
			lwp_lock(l);
			prio = lwp_eprio(l);
			continue;
		}
		/*
		 * If the owner's priority is already higher than ours,
		 * there's nothing to do anymore.
		 */
		if (prio <= lwp_eprio(owner)) {
			if (dolock)
				lwp_unlock(owner);
			break;
		}
		/*
		 * Lend our priority to the 'owner' LWP.
		 *
		 * Update lenders info for turnstile_unlendpri.
		 */
		ts = l->l_ts;
		KASSERT(ts->ts_inheritor == owner || ts->ts_inheritor == NULL);
		if (ts->ts_inheritor == NULL) {
			ts->ts_inheritor = owner;
			ts->ts_eprio = prio;
			SLIST_INSERT_HEAD(&owner->l_pi_lenders, ts, ts_pichain);
			lwp_lendpri(owner, prio);
		} else if (prio > ts->ts_eprio) {
			ts->ts_eprio = prio;
			lwp_lendpri(owner, prio);
		}
		if (dolock)
			lwp_unlock(l);
		LOCKDEBUG_BARRIER(owner->l_mutex, 1);
		l = owner;
	}
	LOCKDEBUG_BARRIER(l->l_mutex, 1);
	if (cur->l_mutex != atomic_load_relaxed(&l->l_mutex)) {
		lwp_unlock(l);
		lwp_lock(cur);
	}
	LOCKDEBUG_BARRIER(cur->l_mutex, 1);
}
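
/*
 * Example of the chain walk above (LWP and lock names are hypothetical):
 * if high-priority LWP A blocks on mutex M1 owned by B, and B is itself
 * blocked on mutex M2 owned by C, then turnstile_lendpri(A) first lends
 * A's priority to B through M1's turnstile, then advances to B and lends
 * the same priority to C through M2's turnstile, and so on until it
 * reaches an LWP that is not blocked or whose priority is already at
 * least as high.
 */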

/*
 * turnstile_unlendpri: undo turnstile_lendpri
 */

static void
turnstile_unlendpri(turnstile_t *ts)
{
	lwp_t * const l = curlwp;
	turnstile_t *iter;
	turnstile_t *next;
	turnstile_t *prev = NULL;
	pri_t prio;
	bool dolock;

	KASSERT(ts->ts_inheritor != NULL);
	ts->ts_inheritor = NULL;
	dolock = (atomic_load_relaxed(&l->l_mutex) ==
	    l->l_cpu->ci_schedstate.spc_lwplock);
	if (dolock) {
		lwp_lock(l);
	}

	/*
	 * The following loop does two things:
	 *
	 * - remove ts from the list;
	 *
	 * - from the rest of the list, find the highest priority.
	 */

	prio = -1;
	KASSERT(!SLIST_EMPTY(&l->l_pi_lenders));
	for (iter = SLIST_FIRST(&l->l_pi_lenders);
	    iter != NULL; iter = next) {
		KASSERT(lwp_eprio(l) >= ts->ts_eprio);
		next = SLIST_NEXT(iter, ts_pichain);
		if (iter == ts) {
			if (prev == NULL) {
				SLIST_REMOVE_HEAD(&l->l_pi_lenders,
				    ts_pichain);
			} else {
				SLIST_REMOVE_AFTER(prev, ts_pichain);
			}
		} else if (prio < iter->ts_eprio) {
			prio = iter->ts_eprio;
		}
		prev = iter;
	}

	lwp_lendpri(l, prio);

	if (dolock) {
		lwp_unlock(l);
	}
}

/*
 * turnstile_block:
 *
 *	Enter an object into the turnstile chain and prepare the current
 *	LWP for sleep.
 */
void
turnstile_block(turnstile_t *ts, int q, wchan_t obj, syncobj_t *sobj)
{
	lwp_t * const l = curlwp; /* cached curlwp */
	turnstile_t *ots;
	tschain_t *tc;
	kmutex_t *lock;
	sleepq_t *sq;
	pri_t obase;
	u_int hash;

	hash = TS_HASH(obj);
	tc = &turnstile_chains[hash];
	lock = &turnstile_locks[hash].lock;

	KASSERT(q == TS_READER_Q || q == TS_WRITER_Q);
	KASSERT(mutex_owned(lock));
	KASSERT(l != NULL && l->l_ts != NULL);

	if (ts == NULL) {
		/*
		 * We are the first thread to wait for this object;
		 * lend our turnstile to it.
		 */
		ts = l->l_ts;
		KASSERT(TS_ALL_WAITERS(ts) == 0);
		KASSERT(LIST_EMPTY(&ts->ts_sleepq[TS_READER_Q]) &&
		    LIST_EMPTY(&ts->ts_sleepq[TS_WRITER_Q]));
		ts->ts_obj = obj;
		ts->ts_inheritor = NULL;
		LIST_INSERT_HEAD(tc, ts, ts_chain);
	} else {
		/*
		 * Object already has a turnstile.  Put our turnstile
		 * onto the free list, and reference the existing
		 * turnstile instead.
		 */
		ots = l->l_ts;
		KASSERT(ots->ts_free == NULL);
		ots->ts_free = ts->ts_free;
		ts->ts_free = ots;
		l->l_ts = ts;

		KASSERT(ts->ts_obj == obj);
		KASSERT(TS_ALL_WAITERS(ts) != 0);
		KASSERT(!LIST_EMPTY(&ts->ts_sleepq[TS_READER_Q]) ||
		    !LIST_EMPTY(&ts->ts_sleepq[TS_WRITER_Q]));
	}

	sq = &ts->ts_sleepq[q];
	ts->ts_waiters[q]++;
	sleepq_enter(sq, l, lock);
	LOCKDEBUG_BARRIER(lock, 1);
	l->l_kpriority = true;
	obase = l->l_kpribase;
	if (obase < PRI_KTHREAD)
		l->l_kpribase = PRI_KTHREAD;
	sleepq_enqueue(sq, obj, "tstile", sobj, false);

	/*
	 * Disable preemption across this entire block, as we may drop
	 * scheduler locks (allowing preemption), and would prefer not
	 * to be interrupted while in a state of flux.
	 */
	KPREEMPT_DISABLE(l);
	KASSERT(lock == l->l_mutex);
	turnstile_lendpri(l);
	sleepq_block(0, false, sobj);
	l->l_kpribase = obase;
	KPREEMPT_ENABLE(l);
}
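
/*
 * A minimal sketch of the release side (the `lock' object is again
 * hypothetical; the real callers are the mutex and rwlock release
 * paths).  turnstile_lookup() takes the chain lock, and
 * turnstile_wakeup() drops it on the way out:
 *
 *	turnstile_t *ts;
 *
 *	ts = turnstile_lookup(lock);		// interlock now held
 *	if (ts != NULL) {
 *		// Wake one writer; also restores any lent priority.
 *		turnstile_wakeup(ts, TS_WRITER_Q, 1, NULL);
 *	} else {
 *		turnstile_exit(lock);		// nobody waiting
 *	}
 */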

/*
 * turnstile_wakeup:
 *
 *	Wake up the specified number of threads that are blocked
 *	in a turnstile.
 */
void
turnstile_wakeup(turnstile_t *ts, int q, int count, lwp_t *nl)
{
	sleepq_t *sq;
	kmutex_t *lock;
	u_int hash;
	lwp_t *l;

	hash = TS_HASH(ts->ts_obj);
	lock = &turnstile_locks[hash].lock;
	sq = &ts->ts_sleepq[q];

	KASSERT(q == TS_READER_Q || q == TS_WRITER_Q);
	KASSERT(count > 0 && count <= TS_WAITERS(ts, q));
	KASSERT(mutex_owned(lock));
	KASSERT(ts->ts_inheritor == curlwp || ts->ts_inheritor == NULL);

	/*
	 * Restore inherited priority if necessary.
	 */

	if (ts->ts_inheritor != NULL) {
		turnstile_unlendpri(ts);
	}

	if (nl != NULL) {
#if defined(DEBUG) || defined(LOCKDEBUG)
		LIST_FOREACH(l, sq, l_sleepchain) {
			if (l == nl)
				break;
		}
		if (l == NULL)
			panic("turnstile_wakeup: nl not on sleepq");
#endif
		turnstile_remove(ts, nl, q);
	} else {
		while (count-- > 0) {
			l = LIST_FIRST(sq);
			KASSERT(l != NULL);
			turnstile_remove(ts, l, q);
		}
	}
	mutex_spin_exit(lock);
}

/*
 * turnstile_unsleep:
 *
 *	Remove an LWP from the turnstile.  This is called when the LWP has
 *	not been awoken normally but instead interrupted: for example, if it
 *	has received a signal.  It's not a valid action for turnstiles,
 *	since LWPs blocking on a turnstile are not interruptible.
 */
void
turnstile_unsleep(lwp_t *l, bool cleanup)
{

	lwp_unlock(l);
	panic("turnstile_unsleep");
}

/*
 * turnstile_changepri:
 *
 *	Adjust the priority of an LWP residing on a turnstile.
 */
void
turnstile_changepri(lwp_t *l, pri_t pri)
{

	/* XXX priority inheritance */
	sleepq_changepri(l, pri);
}

#if defined(LOCKDEBUG)
/*
 * turnstile_print:
 *
 *	Given the address of a lock object, print the contents of a
 *	turnstile.
 */
void
turnstile_print(volatile void *obj, void (*pr)(const char *, ...))
{
	turnstile_t *ts;
	tschain_t *tc;
	sleepq_t *rsq, *wsq;
	u_int hash;
	lwp_t *l;

	hash = TS_HASH(obj);
	tc = &turnstile_chains[hash];

	LIST_FOREACH(ts, tc, ts_chain)
		if (ts->ts_obj == obj)
			break;

	if (ts == NULL) {
		(*pr)("Turnstile: no active turnstile for this lock.\n");
		return;
	}

	rsq = &ts->ts_sleepq[TS_READER_Q];
	wsq = &ts->ts_sleepq[TS_WRITER_Q];

	(*pr)("Turnstile:\n");
	(*pr)("=> %d waiting readers:", TS_WAITERS(ts, TS_READER_Q));
	LIST_FOREACH(l, rsq, l_sleepchain) {
		(*pr)(" %p", l);
	}
	(*pr)("\n");

	(*pr)("=> %d waiting writers:", TS_WAITERS(ts, TS_WRITER_Q));
	LIST_FOREACH(l, wsq, l_sleepchain) {
		(*pr)(" %p", l);
	}
	(*pr)("\n");
}
#endif	/* LOCKDEBUG */
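
/*
 * Example turnstile_print() output for a lock with one waiting reader
 * and two waiting writers, following the format strings above (the
 * pointer values are illustrative only):
 *
 *	Turnstile:
 *	=> 1 waiting readers: 0xffff99a63a81c080
 *	=> 2 waiting writers: 0xffff99a63a81c500 0xffff99a63a81c980
 */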