/*	$NetBSD: kern_turnstile.c,v 1.46 2023/04/09 09:18:09 riastradh Exp $	*/

/*-
 * Copyright (c) 2002, 2006, 2007, 2009, 2019, 2020
 *     The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe and Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Turnstiles are described in detail in:
 *
 *	Solaris Internals: Core Kernel Architecture, Jim Mauro and
 *	    Richard McDougall.
 *
 * Turnstiles are kept in a hash table.  There are likely to be many more
 * synchronisation objects than there are threads.  Since a thread can block
 * on only one lock at a time, we only need one turnstile per thread, and
 * so they are allocated at thread creation time.
 *
 * When a thread decides it needs to block on a lock, it looks up the
 * active turnstile for that lock.  If no active turnstile exists, then
 * the thread lends its turnstile to the lock.  If there is already an
 * active turnstile for the lock, the thread places its turnstile on a
 * list of free turnstiles, and references the active one instead.
 *
 * The act of looking up the turnstile acquires an interlock on the sleep
 * queue.  If a thread decides it doesn't need to block after all, then this
 * interlock must be released by explicitly aborting the turnstile
 * operation.
 *
 * When a thread is awakened, it needs to get its turnstile back.  If there
 * are still other threads waiting in the active turnstile, the thread
 * grabs a free turnstile off the free list.  Otherwise, it can take back
 * the active turnstile from the lock (thus deactivating the turnstile).
 *
 * Turnstiles are where we do priority inheritance.
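 *
 * Callers therefore follow a lookup-then-block-or-abort protocol.  A
 * sketch only (the real consumers are the kernel mutex and rwlock
 * implementations; still_contended() is a hypothetical stand-in for
 * the caller's own re-check of the lock word under the chain lock):
 *
 *	ts = turnstile_lookup(lock);
 *	if (still_contended(lock))
 *		turnstile_block(ts, TS_WRITER_Q, lock, sobj);
 *	else
 *		turnstile_exit(lock);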
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_turnstile.c,v 1.46 2023/04/09 09:18:09 riastradh Exp $");

#include <sys/param.h>
#include <sys/lockdebug.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/sleepq.h>
#include <sys/sleeptab.h>
#include <sys/systm.h>

/*
 * Shift of 6 aligns to typical cache line size of 64 bytes;  there's no
 * point having two turnstile locks to back two lock objects that share one
 * cache line.
 */
#define	TS_HASH_SIZE	128
#define	TS_HASH_MASK	(TS_HASH_SIZE - 1)
#define	TS_HASH(obj)	(((uintptr_t)(obj) >> 6) & TS_HASH_MASK)

static tschain_t	turnstile_chains[TS_HASH_SIZE] __cacheline_aligned;
struct pool		turnstile_pool;

static union {
	kmutex_t	lock;
	uint8_t		pad[COHERENCY_UNIT];
} turnstile_locks[TS_HASH_SIZE] __cacheline_aligned;

/*
 * turnstile_init:
 *
 *	Initialize the turnstile mechanism.
 */
void
turnstile_init(void)
{
	int i;

	for (i = 0; i < TS_HASH_SIZE; i++) {
		LIST_INIT(&turnstile_chains[i]);
		mutex_init(&turnstile_locks[i].lock, MUTEX_DEFAULT, IPL_SCHED);
	}

	pool_init(&turnstile_pool, sizeof(turnstile_t), coherency_unit,
	    0, 0, "tstile", NULL, IPL_NONE);

	turnstile_ctor(&turnstile0);
}

/*
 * turnstile_ctor:
 *
 *	Constructor for turnstiles.
 */
void
turnstile_ctor(turnstile_t *ts)
{

	memset(ts, 0, sizeof(*ts));
	sleepq_init(&ts->ts_sleepq[TS_READER_Q]);
	sleepq_init(&ts->ts_sleepq[TS_WRITER_Q]);
}

/*
 * turnstile_remove:
 *
 *	Remove an LWP from a turnstile sleep queue and wake it.
 */
static inline void
turnstile_remove(turnstile_t *ts, lwp_t *l, int q)
{
	turnstile_t *nts;

	KASSERT(l->l_ts == ts);

	/*
	 * This thread is no longer using the active turnstile.
	 * Find an inactive one on the free list to give to it.
	 */
	if ((nts = ts->ts_free) != NULL) {
		KASSERT(TS_ALL_WAITERS(ts) > 1);
		l->l_ts = nts;
		ts->ts_free = nts->ts_free;
		nts->ts_free = NULL;
	} else {
		/*
		 * If the free list is empty, this is the last
		 * waiter.
		 */
		KASSERT(TS_ALL_WAITERS(ts) == 1);
		LIST_REMOVE(ts, ts_chain);
	}

	ts->ts_waiters[q]--;
	sleepq_remove(&ts->ts_sleepq[q], l);
}

/*
 * turnstile_lookup:
 *
 *	Look up the turnstile for the specified lock.  This acquires and
 *	holds the turnstile chain lock (sleep queue interlock).
 */
turnstile_t *
turnstile_lookup(wchan_t obj)
{
	turnstile_t *ts;
	tschain_t *tc;
	u_int hash;

	hash = TS_HASH(obj);
	tc = &turnstile_chains[hash];
	mutex_spin_enter(&turnstile_locks[hash].lock);

	LIST_FOREACH(ts, tc, ts_chain)
		if (ts->ts_obj == obj)
			return (ts);

	/*
	 * No turnstile yet for this lock.  No problem, turnstile_block()
	 * handles this by fetching the turnstile from the blocking thread.
	 */
	return (NULL);
}

/*
 * turnstile_exit:
 *
 *	Abort a turnstile operation.
 */
void
turnstile_exit(wchan_t obj)
{

	mutex_spin_exit(&turnstile_locks[TS_HASH(obj)].lock);
}

/*
 * turnstile_lendpri:
 *
 *	Lend our priority to LWPs on the blocking chain.
 *
 *	If the current owner of the lock (l->l_wchan, set by sleepq_enqueue)
 *	has a priority lower than ours (lwp_eprio(l)), lend our priority to
 *	it to avoid priority inversions.
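 *
 *	For example (a sketch with made-up numbers): if LWP A at priority
 *	50 blocks on a mutex held by LWP B at priority 20, B is lent
 *	priority 50 so that an unrelated priority-30 LWP cannot starve it.
 *	The loop below follows the chain of owners, so if B is in turn
 *	blocked on a lock owned by LWP C, C is boosted to 50 as well.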
 */
static void
turnstile_lendpri(lwp_t *cur)
{
	lwp_t * l = cur;
	pri_t prio;

	/*
	 * NOTE: if you get a panic in this code block, it is likely that
	 * a lock has been destroyed or corrupted while still in use.  Try
	 * compiling a kernel with LOCKDEBUG to pinpoint the problem.
	 */
	LOCKDEBUG_BARRIER(l->l_mutex, 1);
	KASSERT(l == curlwp);
	prio = lwp_eprio(l);
	for (;;) {
		lwp_t *owner;
		turnstile_t *ts;
		bool dolock;

		if (l->l_wchan == NULL)
			break;

		/*
		 * Ask the syncobj for the owner of the lock.
		 */
		owner = (*l->l_syncobj->sobj_owner)(l->l_wchan);
		if (owner == NULL)
			break;

		/*
		 * The owner may have changed as we have dropped the tc lock.
		 */
		if (cur == owner) {
			/*
			 * We own the lock: stop here, sleepq_block()
			 * should wake up immediately.
			 */
			break;
		}
		/*
		 * Acquire owner->l_mutex if we don't have it yet.
		 * Because we already have another LWP lock (l->l_mutex) held,
		 * we need to play a try lock dance to avoid deadlock.
		 */
		dolock = l->l_mutex != atomic_load_relaxed(&owner->l_mutex);
		if (l == owner || (dolock && !lwp_trylock(owner))) {
			/*
			 * The owner was changed behind us or trylock failed.
			 * Restart from curlwp.
			 *
			 * Note that there may be a livelock here:
			 * the owner may try grabbing cur's lock (which is the
			 * tc lock) while we're trying to grab the owner's lock.
			 */
			lwp_unlock(l);
			l = cur;
			lwp_lock(l);
			prio = lwp_eprio(l);
			continue;
		}
		/*
		 * If the owner's priority is already higher than ours,
		 * there's nothing to do anymore.
		 */
		if (prio <= lwp_eprio(owner)) {
			if (dolock)
				lwp_unlock(owner);
			break;
		}
		/*
		 * Lend our priority to the 'owner' LWP.
		 *
		 * Update lenders info for turnstile_unlendpri.
		 */
		ts = l->l_ts;
		KASSERT(ts->ts_inheritor == owner || ts->ts_inheritor == NULL);
		if (ts->ts_inheritor == NULL) {
			ts->ts_inheritor = owner;
			ts->ts_eprio = prio;
			SLIST_INSERT_HEAD(&owner->l_pi_lenders, ts, ts_pichain);
			lwp_lendpri(owner, prio);
		} else if (prio > ts->ts_eprio) {
			ts->ts_eprio = prio;
			lwp_lendpri(owner, prio);
		}
		if (dolock)
			lwp_unlock(l);
		LOCKDEBUG_BARRIER(owner->l_mutex, 1);
		l = owner;
	}
	LOCKDEBUG_BARRIER(l->l_mutex, 1);
	if (cur->l_mutex != atomic_load_relaxed(&l->l_mutex)) {
		lwp_unlock(l);
		lwp_lock(cur);
	}
	LOCKDEBUG_BARRIER(cur->l_mutex, 1);
}

/*
 * turnstile_unlendpri:
 *
 *	Undo turnstile_lendpri().
 */
static void
turnstile_unlendpri(turnstile_t *ts)
{
	lwp_t * const l = curlwp;
	turnstile_t *iter;
	turnstile_t *next;
	turnstile_t *prev = NULL;
	pri_t prio;
	bool dolock;

	KASSERT(ts->ts_inheritor != NULL);
	ts->ts_inheritor = NULL;
	dolock = (atomic_load_relaxed(&l->l_mutex) ==
	    l->l_cpu->ci_schedstate.spc_lwplock);
	if (dolock) {
		lwp_lock(l);
	}

	/*
	 * The following loop does two things:
	 *
	 * - remove ts from the list;
	 *
	 * - from the rest of the list, find the highest priority.
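	 *
	 * The two are combined because l may still hold other contended
	 * locks whose waiters are lending it priority.  l may only drop
	 * to the highest ts_eprio among the remaining lenders; if none
	 * remain, prio stays at -1, which in effect means "no inherited
	 * priority" and lets l fall back to its own priority.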
	 */

	prio = -1;
	KASSERT(!SLIST_EMPTY(&l->l_pi_lenders));
	for (iter = SLIST_FIRST(&l->l_pi_lenders);
	    iter != NULL; iter = next) {
		KASSERT(lwp_eprio(l) >= ts->ts_eprio);
		next = SLIST_NEXT(iter, ts_pichain);
		if (iter == ts) {
			if (prev == NULL) {
				SLIST_REMOVE_HEAD(&l->l_pi_lenders,
				    ts_pichain);
			} else {
				SLIST_REMOVE_AFTER(prev, ts_pichain);
			}
		} else if (prio < iter->ts_eprio) {
			prio = iter->ts_eprio;
		}
		prev = iter;
	}

	lwp_lendpri(l, prio);

	if (dolock) {
		lwp_unlock(l);
	}
}

/*
 * turnstile_block:
 *
 *	Enter an object into the turnstile chain and prepare the current
 *	LWP for sleep.
 */
void
turnstile_block(turnstile_t *ts, int q, wchan_t obj, syncobj_t *sobj)
{
	lwp_t * const l = curlwp; /* cached curlwp */
	turnstile_t *ots;
	tschain_t *tc;
	kmutex_t *lock;
	sleepq_t *sq;
	pri_t obase;
	u_int hash;

	hash = TS_HASH(obj);
	tc = &turnstile_chains[hash];
	lock = &turnstile_locks[hash].lock;

	KASSERT(q == TS_READER_Q || q == TS_WRITER_Q);
	KASSERT(mutex_owned(lock));
	KASSERT(l != NULL);
	KASSERT(l->l_ts != NULL);

	if (ts == NULL) {
		/*
		 * We are the first thread to wait for this object;
		 * lend our turnstile to it.
		 */
		ts = l->l_ts;
		KASSERT(TS_ALL_WAITERS(ts) == 0);
		KASSERT(LIST_EMPTY(&ts->ts_sleepq[TS_READER_Q]));
		KASSERT(LIST_EMPTY(&ts->ts_sleepq[TS_WRITER_Q]));
		ts->ts_obj = obj;
		ts->ts_inheritor = NULL;
		LIST_INSERT_HEAD(tc, ts, ts_chain);
	} else {
		/*
		 * Object already has a turnstile.  Put our turnstile
		 * onto the free list, and reference the existing
		 * turnstile instead.
		 */
		ots = l->l_ts;
		KASSERT(ots->ts_free == NULL);
		ots->ts_free = ts->ts_free;
		ts->ts_free = ots;
		l->l_ts = ts;

		KASSERT(ts->ts_obj == obj);
		KASSERT(TS_ALL_WAITERS(ts) != 0);
		KASSERT(!LIST_EMPTY(&ts->ts_sleepq[TS_READER_Q]) ||
		    !LIST_EMPTY(&ts->ts_sleepq[TS_WRITER_Q]));
	}

	sq = &ts->ts_sleepq[q];
	ts->ts_waiters[q]++;
	sleepq_enter(sq, l, lock);
	LOCKDEBUG_BARRIER(lock, 1);
	l->l_kpriority = true;
	obase = l->l_kpribase;
	if (obase < PRI_KTHREAD)
		l->l_kpribase = PRI_KTHREAD;
	sleepq_enqueue(sq, obj, "tstile", sobj, false);

	/*
	 * Disable preemption across this entire block, as we may drop
	 * scheduler locks (allowing preemption), and would prefer not
	 * to be interrupted while in a state of flux.
	 */
	KPREEMPT_DISABLE(l);
	KASSERT(lock == l->l_mutex);
	turnstile_lendpri(l);
	sleepq_block(0, false, sobj);
	l->l_kpribase = obase;
	KPREEMPT_ENABLE(l);
}

/*
 * turnstile_wakeup:
 *
 *	Wake up the specified number of threads that are blocked
 *	in a turnstile.
 */
void
turnstile_wakeup(turnstile_t *ts, int q, int count, lwp_t *nl)
{
	sleepq_t *sq;
	kmutex_t *lock;
	u_int hash;
	lwp_t *l;

	hash = TS_HASH(ts->ts_obj);
	lock = &turnstile_locks[hash].lock;
	sq = &ts->ts_sleepq[q];

	KASSERT(q == TS_READER_Q || q == TS_WRITER_Q);
	KASSERT(count > 0);
	KASSERT(count <= TS_WAITERS(ts, q));
	KASSERT(mutex_owned(lock));
	KASSERT(ts->ts_inheritor == curlwp || ts->ts_inheritor == NULL);

	/*
	 * Restore inherited priority, if necessary.
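	 *
	 * ts_inheritor is curlwp when blocked waiters lent us their
	 * priority while we held the lock now being released; the loan
	 * is returned here, before any waiter is set running.  A typical
	 * release path (a sketch only) wakes every blocked writer with:
	 *
	 *	turnstile_wakeup(ts, TS_WRITER_Q,
	 *	    TS_WAITERS(ts, TS_WRITER_Q), NULL);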
	 */

	if (ts->ts_inheritor != NULL) {
		turnstile_unlendpri(ts);
	}

	if (nl != NULL) {
#if defined(DEBUG) || defined(LOCKDEBUG)
		LIST_FOREACH(l, sq, l_sleepchain) {
			if (l == nl)
				break;
		}
		if (l == NULL)
			panic("turnstile_wakeup: nl not on sleepq");
#endif
		turnstile_remove(ts, nl, q);
	} else {
		while (count-- > 0) {
			l = LIST_FIRST(sq);
			KASSERT(l != NULL);
			turnstile_remove(ts, l, q);
		}
	}
	mutex_spin_exit(lock);
}

/*
 * turnstile_unsleep:
 *
 *	Remove an LWP from the turnstile.  This is called when the LWP has
 *	not been awoken normally but instead interrupted: for example, if it
 *	has received a signal.  It's not a valid action for turnstiles,
 *	since LWPs blocking on a turnstile are not interruptible.
 */
void
turnstile_unsleep(lwp_t *l, bool cleanup)
{

	lwp_unlock(l);
	panic("turnstile_unsleep");
}

/*
 * turnstile_changepri:
 *
 *	Adjust the priority of an LWP residing on a turnstile.
 */
void
turnstile_changepri(lwp_t *l, pri_t pri)
{

	/* XXX priority inheritance */
	sleepq_changepri(l, pri);
}

#if defined(LOCKDEBUG)
/*
 * turnstile_print:
 *
 *	Given the address of a lock object, print the contents of a
 *	turnstile.
 */
void
turnstile_print(volatile void *obj, void (*pr)(const char *, ...))
{
	turnstile_t *ts;
	tschain_t *tc;
	sleepq_t *rsq, *wsq;
	u_int hash;
	lwp_t *l;

	hash = TS_HASH(obj);
	tc = &turnstile_chains[hash];

	LIST_FOREACH(ts, tc, ts_chain)
		if (ts->ts_obj == obj)
			break;

	if (ts == NULL) {
		(*pr)("Turnstile: no active turnstile for this lock.\n");
		return;
	}

	rsq = &ts->ts_sleepq[TS_READER_Q];
	wsq = &ts->ts_sleepq[TS_WRITER_Q];

	(*pr)("Turnstile:\n");
	(*pr)("=> %d waiting readers:", TS_WAITERS(ts, TS_READER_Q));
	LIST_FOREACH(l, rsq, l_sleepchain) {
		(*pr)(" %p", l);
	}
	(*pr)("\n");

	(*pr)("=> %d waiting writers:", TS_WAITERS(ts, TS_WRITER_Q));
	LIST_FOREACH(l, wsq, l_sleepchain) {
		(*pr)(" %p", l);
	}
	(*pr)("\n");
}
#endif	/* LOCKDEBUG */