1 /* $NetBSD: kern_turnstile.c,v 1.40 2020/05/23 20:45:10 ad Exp $ */ 2 3 /*- 4 * Copyright (c) 2002, 2006, 2007, 2009, 2019, 2020 5 * The NetBSD Foundation, Inc. 6 * All rights reserved. 7 * 8 * This code is derived from software contributed to The NetBSD Foundation 9 * by Jason R. Thorpe and Andrew Doran. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 * POSSIBILITY OF SUCH DAMAGE. 31 */ 32 33 /* 34 * Turnstiles are described in detail in: 35 * 36 * Solaris Internals: Core Kernel Architecture, Jim Mauro and 37 * Richard McDougall. 38 * 39 * Turnstiles are kept in a hash table. There are likely to be many more 40 * synchronisation objects than there are threads. Since a thread can block 41 * on only one lock at a time, we only need one turnstile per thread, and 42 * so they are allocated at thread creation time. 43 * 44 * When a thread decides it needs to block on a lock, it looks up the 45 * active turnstile for that lock. If no active turnstile exists, then 46 * the process lends its turnstile to the lock. If there is already an 47 * active turnstile for the lock, the thread places its turnstile on a 48 * list of free turnstiles, and references the active one instead. 49 * 50 * The act of looking up the turnstile acquires an interlock on the sleep 51 * queue. If a thread decides it doesn't need to block after all, then this 52 * interlock must be released by explicitly aborting the turnstile 53 * operation. 54 * 55 * When a thread is awakened, it needs to get its turnstile back. If there 56 * are still other threads waiting in the active turnstile, the thread 57 * grabs a free turnstile off the free list. Otherwise, it can take back 58 * the active turnstile from the lock (thus deactivating the turnstile). 59 * 60 * Turnstiles are where we do priority inheritence. 61 */ 62 63 #include <sys/cdefs.h> 64 __KERNEL_RCSID(0, "$NetBSD: kern_turnstile.c,v 1.40 2020/05/23 20:45:10 ad Exp $"); 65 66 #include <sys/param.h> 67 #include <sys/lockdebug.h> 68 #include <sys/pool.h> 69 #include <sys/proc.h> 70 #include <sys/sleepq.h> 71 #include <sys/systm.h> 72 73 /* 74 * Shift of 6 aligns to typical cache line size of 64 bytes; there's no 75 * point having two turnstile locks to back two lock objects that share one 76 * cache line. 77 */ 78 #define TS_HASH_SIZE 128 79 #define TS_HASH_MASK (TS_HASH_SIZE - 1) 80 #define TS_HASH(obj) (((uintptr_t)(obj) >> 6) & TS_HASH_MASK) 81 82 static tschain_t turnstile_chains[TS_HASH_SIZE] __cacheline_aligned; 83 struct pool turnstile_pool; 84 extern turnstile_t turnstile0; 85 86 static union { 87 kmutex_t lock; 88 uint8_t pad[COHERENCY_UNIT]; 89 } turnstile_locks[TS_HASH_SIZE] __cacheline_aligned; 90 91 /* 92 * turnstile_init: 93 * 94 * Initialize the turnstile mechanism. 95 */ 96 void 97 turnstile_init(void) 98 { 99 int i; 100 101 for (i = 0; i < TS_HASH_SIZE; i++) { 102 LIST_INIT(&turnstile_chains[i]); 103 mutex_init(&turnstile_locks[i].lock, MUTEX_DEFAULT, IPL_SCHED); 104 } 105 106 pool_init(&turnstile_pool, sizeof(turnstile_t), coherency_unit, 107 0, 0, "tstile", NULL, IPL_NONE); 108 109 turnstile_ctor(&turnstile0); 110 } 111 112 /* 113 * turnstile_ctor: 114 * 115 * Constructor for turnstiles. 116 */ 117 void 118 turnstile_ctor(turnstile_t *ts) 119 { 120 121 memset(ts, 0, sizeof(*ts)); 122 sleepq_init(&ts->ts_sleepq[TS_READER_Q]); 123 sleepq_init(&ts->ts_sleepq[TS_WRITER_Q]); 124 } 125 126 /* 127 * turnstile_remove: 128 * 129 * Remove an LWP from a turnstile sleep queue and wake it. 130 */ 131 static inline void 132 turnstile_remove(turnstile_t *ts, lwp_t *l, int q) 133 { 134 turnstile_t *nts; 135 136 KASSERT(l->l_ts == ts); 137 138 /* 139 * This process is no longer using the active turnstile. 140 * Find an inactive one on the free list to give to it. 141 */ 142 if ((nts = ts->ts_free) != NULL) { 143 KASSERT(TS_ALL_WAITERS(ts) > 1); 144 l->l_ts = nts; 145 ts->ts_free = nts->ts_free; 146 nts->ts_free = NULL; 147 } else { 148 /* 149 * If the free list is empty, this is the last 150 * waiter. 151 */ 152 KASSERT(TS_ALL_WAITERS(ts) == 1); 153 LIST_REMOVE(ts, ts_chain); 154 } 155 156 ts->ts_waiters[q]--; 157 sleepq_remove(&ts->ts_sleepq[q], l); 158 } 159 160 /* 161 * turnstile_lookup: 162 * 163 * Look up the turnstile for the specified lock. This acquires and 164 * holds the turnstile chain lock (sleep queue interlock). 165 */ 166 turnstile_t * 167 turnstile_lookup(wchan_t obj) 168 { 169 turnstile_t *ts; 170 tschain_t *tc; 171 u_int hash; 172 173 hash = TS_HASH(obj); 174 tc = &turnstile_chains[hash]; 175 mutex_spin_enter(&turnstile_locks[hash].lock); 176 177 LIST_FOREACH(ts, tc, ts_chain) 178 if (ts->ts_obj == obj) 179 return (ts); 180 181 /* 182 * No turnstile yet for this lock. No problem, turnstile_block() 183 * handles this by fetching the turnstile from the blocking thread. 184 */ 185 return (NULL); 186 } 187 188 /* 189 * turnstile_exit: 190 * 191 * Abort a turnstile operation. 192 */ 193 void 194 turnstile_exit(wchan_t obj) 195 { 196 197 mutex_spin_exit(&turnstile_locks[TS_HASH(obj)].lock); 198 } 199 200 /* 201 * turnstile_lendpri: 202 * 203 * Lend our priority to lwps on the blocking chain. 204 * 205 * If the current owner of the lock (l->l_wchan, set by sleepq_enqueue) 206 * has a priority lower than ours (lwp_eprio(l)), lend our priority to 207 * him to avoid priority inversions. 208 */ 209 210 static void 211 turnstile_lendpri(lwp_t *cur) 212 { 213 lwp_t * l = cur; 214 pri_t prio; 215 216 /* 217 * NOTE: if you get a panic in this code block, it is likely that 218 * a lock has been destroyed or corrupted while still in use. Try 219 * compiling a kernel with LOCKDEBUG to pinpoint the problem. 220 */ 221 222 LOCKDEBUG_BARRIER(l->l_mutex, 1); 223 KASSERT(l == curlwp); 224 prio = lwp_eprio(l); 225 for (;;) { 226 lwp_t *owner; 227 turnstile_t *ts; 228 bool dolock; 229 230 if (l->l_wchan == NULL) 231 break; 232 233 /* 234 * Ask syncobj the owner of the lock. 235 */ 236 owner = (*l->l_syncobj->sobj_owner)(l->l_wchan); 237 if (owner == NULL) 238 break; 239 240 /* 241 * The owner may have changed as we have dropped the tc lock. 242 */ 243 if (cur == owner) { 244 /* 245 * We own the lock: stop here, sleepq_block() 246 * should wake up immediatly. 247 */ 248 break; 249 } 250 /* 251 * Acquire owner->l_mutex if we don't have it yet. 252 * Because we already have another LWP lock (l->l_mutex) held, 253 * we need to play a try lock dance to avoid deadlock. 254 */ 255 dolock = l->l_mutex != owner->l_mutex; 256 if (l == owner || (dolock && !lwp_trylock(owner))) { 257 /* 258 * The owner was changed behind us or trylock failed. 259 * Restart from curlwp. 260 * 261 * Note that there may be a livelock here: 262 * the owner may try grabing cur's lock (which is the 263 * tc lock) while we're trying to grab the owner's lock. 264 */ 265 lwp_unlock(l); 266 l = cur; 267 lwp_lock(l); 268 prio = lwp_eprio(l); 269 continue; 270 } 271 /* 272 * If the owner's priority is already higher than ours, 273 * there's nothing to do anymore. 274 */ 275 if (prio <= lwp_eprio(owner)) { 276 if (dolock) 277 lwp_unlock(owner); 278 break; 279 } 280 /* 281 * Lend our priority to the 'owner' LWP. 282 * 283 * Update lenders info for turnstile_unlendpri. 284 */ 285 ts = l->l_ts; 286 KASSERT(ts->ts_inheritor == owner || ts->ts_inheritor == NULL); 287 if (ts->ts_inheritor == NULL) { 288 ts->ts_inheritor = owner; 289 ts->ts_eprio = prio; 290 SLIST_INSERT_HEAD(&owner->l_pi_lenders, ts, ts_pichain); 291 lwp_lendpri(owner, prio); 292 } else if (prio > ts->ts_eprio) { 293 ts->ts_eprio = prio; 294 lwp_lendpri(owner, prio); 295 } 296 if (dolock) 297 lwp_unlock(l); 298 LOCKDEBUG_BARRIER(owner->l_mutex, 1); 299 l = owner; 300 } 301 LOCKDEBUG_BARRIER(l->l_mutex, 1); 302 if (cur->l_mutex != l->l_mutex) { 303 lwp_unlock(l); 304 lwp_lock(cur); 305 } 306 LOCKDEBUG_BARRIER(cur->l_mutex, 1); 307 } 308 309 /* 310 * turnstile_unlendpri: undo turnstile_lendpri 311 */ 312 313 static void 314 turnstile_unlendpri(turnstile_t *ts) 315 { 316 lwp_t * const l = curlwp; 317 turnstile_t *iter; 318 turnstile_t *next; 319 turnstile_t *prev = NULL; 320 pri_t prio; 321 bool dolock; 322 323 KASSERT(ts->ts_inheritor != NULL); 324 ts->ts_inheritor = NULL; 325 dolock = l->l_mutex == l->l_cpu->ci_schedstate.spc_lwplock; 326 if (dolock) { 327 lwp_lock(l); 328 } 329 330 /* 331 * the following loop does two things. 332 * 333 * - remove ts from the list. 334 * 335 * - from the rest of the list, find the highest priority. 336 */ 337 338 prio = -1; 339 KASSERT(!SLIST_EMPTY(&l->l_pi_lenders)); 340 for (iter = SLIST_FIRST(&l->l_pi_lenders); 341 iter != NULL; iter = next) { 342 KASSERT(lwp_eprio(l) >= ts->ts_eprio); 343 next = SLIST_NEXT(iter, ts_pichain); 344 if (iter == ts) { 345 if (prev == NULL) { 346 SLIST_REMOVE_HEAD(&l->l_pi_lenders, 347 ts_pichain); 348 } else { 349 SLIST_REMOVE_AFTER(prev, ts_pichain); 350 } 351 } else if (prio < iter->ts_eprio) { 352 prio = iter->ts_eprio; 353 } 354 prev = iter; 355 } 356 357 lwp_lendpri(l, prio); 358 359 if (dolock) { 360 lwp_unlock(l); 361 } 362 } 363 364 /* 365 * turnstile_block: 366 * 367 * Enter an object into the turnstile chain and prepare the current 368 * LWP for sleep. 369 */ 370 void 371 turnstile_block(turnstile_t *ts, int q, wchan_t obj, syncobj_t *sobj) 372 { 373 lwp_t * const l = curlwp; /* cached curlwp */ 374 turnstile_t *ots; 375 tschain_t *tc; 376 kmutex_t *lock; 377 sleepq_t *sq; 378 pri_t obase; 379 u_int hash; 380 381 hash = TS_HASH(obj); 382 tc = &turnstile_chains[hash]; 383 lock = &turnstile_locks[hash].lock; 384 385 KASSERT(q == TS_READER_Q || q == TS_WRITER_Q); 386 KASSERT(mutex_owned(lock)); 387 KASSERT(l != NULL && l->l_ts != NULL); 388 389 if (ts == NULL) { 390 /* 391 * We are the first thread to wait for this object; 392 * lend our turnstile to it. 393 */ 394 ts = l->l_ts; 395 KASSERT(TS_ALL_WAITERS(ts) == 0); 396 KASSERT(LIST_EMPTY(&ts->ts_sleepq[TS_READER_Q]) && 397 LIST_EMPTY(&ts->ts_sleepq[TS_WRITER_Q])); 398 ts->ts_obj = obj; 399 ts->ts_inheritor = NULL; 400 LIST_INSERT_HEAD(tc, ts, ts_chain); 401 } else { 402 /* 403 * Object already has a turnstile. Put our turnstile 404 * onto the free list, and reference the existing 405 * turnstile instead. 406 */ 407 ots = l->l_ts; 408 KASSERT(ots->ts_free == NULL); 409 ots->ts_free = ts->ts_free; 410 ts->ts_free = ots; 411 l->l_ts = ts; 412 413 KASSERT(ts->ts_obj == obj); 414 KASSERT(TS_ALL_WAITERS(ts) != 0); 415 KASSERT(!LIST_EMPTY(&ts->ts_sleepq[TS_READER_Q]) || 416 !LIST_EMPTY(&ts->ts_sleepq[TS_WRITER_Q])); 417 } 418 419 sq = &ts->ts_sleepq[q]; 420 ts->ts_waiters[q]++; 421 sleepq_enter(sq, l, lock); 422 LOCKDEBUG_BARRIER(lock, 1); 423 l->l_kpriority = true; 424 obase = l->l_kpribase; 425 if (obase < PRI_KTHREAD) 426 l->l_kpribase = PRI_KTHREAD; 427 sleepq_enqueue(sq, obj, "tstile", sobj, false); 428 429 /* 430 * Disable preemption across this entire block, as we may drop 431 * scheduler locks (allowing preemption), and would prefer not 432 * to be interrupted while in a state of flux. 433 */ 434 KPREEMPT_DISABLE(l); 435 KASSERT(lock == l->l_mutex); 436 turnstile_lendpri(l); 437 sleepq_block(0, false); 438 l->l_kpribase = obase; 439 KPREEMPT_ENABLE(l); 440 } 441 442 /* 443 * turnstile_wakeup: 444 * 445 * Wake up the specified number of threads that are blocked 446 * in a turnstile. 447 */ 448 void 449 turnstile_wakeup(turnstile_t *ts, int q, int count, lwp_t *nl) 450 { 451 sleepq_t *sq; 452 kmutex_t *lock; 453 u_int hash; 454 lwp_t *l; 455 456 hash = TS_HASH(ts->ts_obj); 457 lock = &turnstile_locks[hash].lock; 458 sq = &ts->ts_sleepq[q]; 459 460 KASSERT(q == TS_READER_Q || q == TS_WRITER_Q); 461 KASSERT(count > 0 && count <= TS_WAITERS(ts, q)); 462 KASSERT(mutex_owned(lock)); 463 KASSERT(ts->ts_inheritor == curlwp || ts->ts_inheritor == NULL); 464 465 /* 466 * restore inherited priority if necessary. 467 */ 468 469 if (ts->ts_inheritor != NULL) { 470 turnstile_unlendpri(ts); 471 } 472 473 if (nl != NULL) { 474 #if defined(DEBUG) || defined(LOCKDEBUG) 475 LIST_FOREACH(l, sq, l_sleepchain) { 476 if (l == nl) 477 break; 478 } 479 if (l == NULL) 480 panic("turnstile_wakeup: nl not on sleepq"); 481 #endif 482 turnstile_remove(ts, nl, q); 483 } else { 484 while (count-- > 0) { 485 l = LIST_FIRST(sq); 486 KASSERT(l != NULL); 487 turnstile_remove(ts, l, q); 488 } 489 } 490 mutex_spin_exit(lock); 491 } 492 493 /* 494 * turnstile_unsleep: 495 * 496 * Remove an LWP from the turnstile. This is called when the LWP has 497 * not been awoken normally but instead interrupted: for example, if it 498 * has received a signal. It's not a valid action for turnstiles, 499 * since LWPs blocking on a turnstile are not interruptable. 500 */ 501 void 502 turnstile_unsleep(lwp_t *l, bool cleanup) 503 { 504 505 lwp_unlock(l); 506 panic("turnstile_unsleep"); 507 } 508 509 /* 510 * turnstile_changepri: 511 * 512 * Adjust the priority of an LWP residing on a turnstile. 513 */ 514 void 515 turnstile_changepri(lwp_t *l, pri_t pri) 516 { 517 518 /* XXX priority inheritance */ 519 sleepq_changepri(l, pri); 520 } 521 522 #if defined(LOCKDEBUG) 523 /* 524 * turnstile_print: 525 * 526 * Given the address of a lock object, print the contents of a 527 * turnstile. 528 */ 529 void 530 turnstile_print(volatile void *obj, void (*pr)(const char *, ...)) 531 { 532 turnstile_t *ts; 533 tschain_t *tc; 534 sleepq_t *rsq, *wsq; 535 u_int hash; 536 lwp_t *l; 537 538 hash = TS_HASH(obj); 539 tc = &turnstile_chains[hash]; 540 541 LIST_FOREACH(ts, tc, ts_chain) 542 if (ts->ts_obj == obj) 543 break; 544 545 if (ts == NULL) { 546 (*pr)("Turnstile: no active turnstile for this lock.\n"); 547 return; 548 } 549 550 rsq = &ts->ts_sleepq[TS_READER_Q]; 551 wsq = &ts->ts_sleepq[TS_WRITER_Q]; 552 553 (*pr)("Turnstile:\n"); 554 (*pr)("=> %d waiting readers:", TS_WAITERS(ts, TS_READER_Q)); 555 LIST_FOREACH(l, rsq, l_sleepchain) { 556 (*pr)(" %p", l); 557 } 558 (*pr)("\n"); 559 560 (*pr)("=> %d waiting writers:", TS_WAITERS(ts, TS_WRITER_Q)); 561 LIST_FOREACH(l, wsq, l_sleepchain) { 562 (*pr)(" %p", l); 563 } 564 (*pr)("\n"); 565 } 566 #endif /* LOCKDEBUG */ 567