1 /* $NetBSD: kern_turnstile.c,v 1.38 2020/03/26 22:43:19 ad Exp $ */ 2 3 /*- 4 * Copyright (c) 2002, 2006, 2007, 2009, 2019, 2020 5 * The NetBSD Foundation, Inc. 6 * All rights reserved. 7 * 8 * This code is derived from software contributed to The NetBSD Foundation 9 * by Jason R. Thorpe and Andrew Doran. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 * POSSIBILITY OF SUCH DAMAGE. 31 */ 32 33 /* 34 * Turnstiles are described in detail in: 35 * 36 * Solaris Internals: Core Kernel Architecture, Jim Mauro and 37 * Richard McDougall. 38 * 39 * Turnstiles are kept in a hash table. There are likely to be many more 40 * synchronisation objects than there are threads. Since a thread can block 41 * on only one lock at a time, we only need one turnstile per thread, and 42 * so they are allocated at thread creation time. 43 * 44 * When a thread decides it needs to block on a lock, it looks up the 45 * active turnstile for that lock. If no active turnstile exists, then 46 * the process lends its turnstile to the lock. If there is already an 47 * active turnstile for the lock, the thread places its turnstile on a 48 * list of free turnstiles, and references the active one instead. 49 * 50 * The act of looking up the turnstile acquires an interlock on the sleep 51 * queue. If a thread decides it doesn't need to block after all, then this 52 * interlock must be released by explicitly aborting the turnstile 53 * operation. 54 * 55 * When a thread is awakened, it needs to get its turnstile back. If there 56 * are still other threads waiting in the active turnstile, the thread 57 * grabs a free turnstile off the free list. Otherwise, it can take back 58 * the active turnstile from the lock (thus deactivating the turnstile). 59 * 60 * Turnstiles are where we do priority inheritence. 61 */ 62 63 #include <sys/cdefs.h> 64 __KERNEL_RCSID(0, "$NetBSD: kern_turnstile.c,v 1.38 2020/03/26 22:43:19 ad Exp $"); 65 66 #include <sys/param.h> 67 #include <sys/lockdebug.h> 68 #include <sys/pool.h> 69 #include <sys/proc.h> 70 #include <sys/sleepq.h> 71 #include <sys/systm.h> 72 73 /* 74 * Shift of 6 aligns to typical cache line size of 64 bytes; there's no 75 * point having two turnstile locks to back two lock objects that share one 76 * cache line. 77 */ 78 #define TS_HASH_SIZE 128 79 #define TS_HASH_MASK (TS_HASH_SIZE - 1) 80 #define TS_HASH(obj) (((uintptr_t)(obj) >> 6) & TS_HASH_MASK) 81 82 static tschain_t turnstile_chains[TS_HASH_SIZE] __cacheline_aligned; 83 pool_cache_t turnstile_cache __read_mostly; 84 extern turnstile_t turnstile0; 85 86 static union { 87 kmutex_t lock; 88 uint8_t pad[COHERENCY_UNIT]; 89 } turnstile_locks[TS_HASH_SIZE] __cacheline_aligned; 90 91 static int turnstile_ctor(void *, void *, int); 92 93 /* 94 * turnstile_init: 95 * 96 * Initialize the turnstile mechanism. 97 */ 98 void 99 turnstile_init(void) 100 { 101 int i; 102 103 for (i = 0; i < TS_HASH_SIZE; i++) { 104 LIST_INIT(&turnstile_chains[i]); 105 mutex_init(&turnstile_locks[i].lock, MUTEX_DEFAULT, IPL_SCHED); 106 } 107 108 turnstile_cache = pool_cache_init(sizeof(turnstile_t), coherency_unit, 109 0, 0, "tstile", NULL, IPL_NONE, turnstile_ctor, NULL, NULL); 110 KASSERT(turnstile_cache != NULL); 111 112 (void)turnstile_ctor(NULL, &turnstile0, 0); 113 } 114 115 /* 116 * turnstile_ctor: 117 * 118 * Constructor for turnstiles. 119 */ 120 static int 121 turnstile_ctor(void *arg, void *obj, int flags) 122 { 123 turnstile_t *ts = obj; 124 125 memset(ts, 0, sizeof(*ts)); 126 sleepq_init(&ts->ts_sleepq[TS_READER_Q]); 127 sleepq_init(&ts->ts_sleepq[TS_WRITER_Q]); 128 return (0); 129 } 130 131 /* 132 * turnstile_remove: 133 * 134 * Remove an LWP from a turnstile sleep queue and wake it. 135 */ 136 static inline void 137 turnstile_remove(turnstile_t *ts, lwp_t *l, int q) 138 { 139 turnstile_t *nts; 140 141 KASSERT(l->l_ts == ts); 142 143 /* 144 * This process is no longer using the active turnstile. 145 * Find an inactive one on the free list to give to it. 146 */ 147 if ((nts = ts->ts_free) != NULL) { 148 KASSERT(TS_ALL_WAITERS(ts) > 1); 149 l->l_ts = nts; 150 ts->ts_free = nts->ts_free; 151 nts->ts_free = NULL; 152 } else { 153 /* 154 * If the free list is empty, this is the last 155 * waiter. 156 */ 157 KASSERT(TS_ALL_WAITERS(ts) == 1); 158 LIST_REMOVE(ts, ts_chain); 159 } 160 161 ts->ts_waiters[q]--; 162 sleepq_remove(&ts->ts_sleepq[q], l); 163 } 164 165 /* 166 * turnstile_lookup: 167 * 168 * Look up the turnstile for the specified lock. This acquires and 169 * holds the turnstile chain lock (sleep queue interlock). 170 */ 171 turnstile_t * 172 turnstile_lookup(wchan_t obj) 173 { 174 turnstile_t *ts; 175 tschain_t *tc; 176 u_int hash; 177 178 hash = TS_HASH(obj); 179 tc = &turnstile_chains[hash]; 180 mutex_spin_enter(&turnstile_locks[hash].lock); 181 182 LIST_FOREACH(ts, tc, ts_chain) 183 if (ts->ts_obj == obj) 184 return (ts); 185 186 /* 187 * No turnstile yet for this lock. No problem, turnstile_block() 188 * handles this by fetching the turnstile from the blocking thread. 189 */ 190 return (NULL); 191 } 192 193 /* 194 * turnstile_exit: 195 * 196 * Abort a turnstile operation. 197 */ 198 void 199 turnstile_exit(wchan_t obj) 200 { 201 202 mutex_spin_exit(&turnstile_locks[TS_HASH(obj)].lock); 203 } 204 205 /* 206 * turnstile_lendpri: 207 * 208 * Lend our priority to lwps on the blocking chain. 209 * 210 * If the current owner of the lock (l->l_wchan, set by sleepq_enqueue) 211 * has a priority lower than ours (lwp_eprio(l)), lend our priority to 212 * him to avoid priority inversions. 213 */ 214 215 static void 216 turnstile_lendpri(lwp_t *cur) 217 { 218 lwp_t * l = cur; 219 pri_t prio; 220 221 /* 222 * NOTE: if you get a panic in this code block, it is likely that 223 * a lock has been destroyed or corrupted while still in use. Try 224 * compiling a kernel with LOCKDEBUG to pinpoint the problem. 225 */ 226 227 LOCKDEBUG_BARRIER(l->l_mutex, 1); 228 KASSERT(l == curlwp); 229 prio = lwp_eprio(l); 230 for (;;) { 231 lwp_t *owner; 232 turnstile_t *ts; 233 bool dolock; 234 235 if (l->l_wchan == NULL) 236 break; 237 238 /* 239 * Ask syncobj the owner of the lock. 240 */ 241 owner = (*l->l_syncobj->sobj_owner)(l->l_wchan); 242 if (owner == NULL) 243 break; 244 245 /* 246 * The owner may have changed as we have dropped the tc lock. 247 */ 248 if (cur == owner) { 249 /* 250 * We own the lock: stop here, sleepq_block() 251 * should wake up immediatly. 252 */ 253 break; 254 } 255 /* 256 * Acquire owner->l_mutex if we don't have it yet. 257 * Because we already have another LWP lock (l->l_mutex) held, 258 * we need to play a try lock dance to avoid deadlock. 259 */ 260 dolock = l->l_mutex != owner->l_mutex; 261 if (l == owner || (dolock && !lwp_trylock(owner))) { 262 /* 263 * The owner was changed behind us or trylock failed. 264 * Restart from curlwp. 265 * 266 * Note that there may be a livelock here: 267 * the owner may try grabing cur's lock (which is the 268 * tc lock) while we're trying to grab the owner's lock. 269 */ 270 lwp_unlock(l); 271 l = cur; 272 lwp_lock(l); 273 prio = lwp_eprio(l); 274 continue; 275 } 276 /* 277 * If the owner's priority is already higher than ours, 278 * there's nothing to do anymore. 279 */ 280 if (prio <= lwp_eprio(owner)) { 281 if (dolock) 282 lwp_unlock(owner); 283 break; 284 } 285 /* 286 * Lend our priority to the 'owner' LWP. 287 * 288 * Update lenders info for turnstile_unlendpri. 289 */ 290 ts = l->l_ts; 291 KASSERT(ts->ts_inheritor == owner || ts->ts_inheritor == NULL); 292 if (ts->ts_inheritor == NULL) { 293 ts->ts_inheritor = owner; 294 ts->ts_eprio = prio; 295 SLIST_INSERT_HEAD(&owner->l_pi_lenders, ts, ts_pichain); 296 lwp_lendpri(owner, prio); 297 } else if (prio > ts->ts_eprio) { 298 ts->ts_eprio = prio; 299 lwp_lendpri(owner, prio); 300 } 301 if (dolock) 302 lwp_unlock(l); 303 LOCKDEBUG_BARRIER(owner->l_mutex, 1); 304 l = owner; 305 } 306 LOCKDEBUG_BARRIER(l->l_mutex, 1); 307 if (cur->l_mutex != l->l_mutex) { 308 lwp_unlock(l); 309 lwp_lock(cur); 310 } 311 LOCKDEBUG_BARRIER(cur->l_mutex, 1); 312 } 313 314 /* 315 * turnstile_unlendpri: undo turnstile_lendpri 316 */ 317 318 static void 319 turnstile_unlendpri(turnstile_t *ts) 320 { 321 lwp_t * const l = curlwp; 322 turnstile_t *iter; 323 turnstile_t *next; 324 turnstile_t *prev = NULL; 325 pri_t prio; 326 bool dolock; 327 328 KASSERT(ts->ts_inheritor != NULL); 329 ts->ts_inheritor = NULL; 330 dolock = l->l_mutex == l->l_cpu->ci_schedstate.spc_lwplock; 331 if (dolock) { 332 lwp_lock(l); 333 } 334 335 /* 336 * the following loop does two things. 337 * 338 * - remove ts from the list. 339 * 340 * - from the rest of the list, find the highest priority. 341 */ 342 343 prio = -1; 344 KASSERT(!SLIST_EMPTY(&l->l_pi_lenders)); 345 for (iter = SLIST_FIRST(&l->l_pi_lenders); 346 iter != NULL; iter = next) { 347 KASSERT(lwp_eprio(l) >= ts->ts_eprio); 348 next = SLIST_NEXT(iter, ts_pichain); 349 if (iter == ts) { 350 if (prev == NULL) { 351 SLIST_REMOVE_HEAD(&l->l_pi_lenders, 352 ts_pichain); 353 } else { 354 SLIST_REMOVE_AFTER(prev, ts_pichain); 355 } 356 } else if (prio < iter->ts_eprio) { 357 prio = iter->ts_eprio; 358 } 359 prev = iter; 360 } 361 362 lwp_lendpri(l, prio); 363 364 if (dolock) { 365 lwp_unlock(l); 366 } 367 } 368 369 /* 370 * turnstile_block: 371 * 372 * Enter an object into the turnstile chain and prepare the current 373 * LWP for sleep. 374 */ 375 void 376 turnstile_block(turnstile_t *ts, int q, wchan_t obj, syncobj_t *sobj) 377 { 378 lwp_t * const l = curlwp; /* cached curlwp */ 379 turnstile_t *ots; 380 tschain_t *tc; 381 kmutex_t *lock; 382 sleepq_t *sq; 383 pri_t obase; 384 u_int hash; 385 386 hash = TS_HASH(obj); 387 tc = &turnstile_chains[hash]; 388 lock = &turnstile_locks[hash].lock; 389 390 KASSERT(q == TS_READER_Q || q == TS_WRITER_Q); 391 KASSERT(mutex_owned(lock)); 392 KASSERT(l != NULL && l->l_ts != NULL); 393 394 if (ts == NULL) { 395 /* 396 * We are the first thread to wait for this object; 397 * lend our turnstile to it. 398 */ 399 ts = l->l_ts; 400 KASSERT(TS_ALL_WAITERS(ts) == 0); 401 KASSERT(LIST_EMPTY(&ts->ts_sleepq[TS_READER_Q]) && 402 LIST_EMPTY(&ts->ts_sleepq[TS_WRITER_Q])); 403 ts->ts_obj = obj; 404 ts->ts_inheritor = NULL; 405 LIST_INSERT_HEAD(tc, ts, ts_chain); 406 } else { 407 /* 408 * Object already has a turnstile. Put our turnstile 409 * onto the free list, and reference the existing 410 * turnstile instead. 411 */ 412 ots = l->l_ts; 413 KASSERT(ots->ts_free == NULL); 414 ots->ts_free = ts->ts_free; 415 ts->ts_free = ots; 416 l->l_ts = ts; 417 418 KASSERT(ts->ts_obj == obj); 419 KASSERT(TS_ALL_WAITERS(ts) != 0); 420 KASSERT(!LIST_EMPTY(&ts->ts_sleepq[TS_READER_Q]) || 421 !LIST_EMPTY(&ts->ts_sleepq[TS_WRITER_Q])); 422 } 423 424 sq = &ts->ts_sleepq[q]; 425 ts->ts_waiters[q]++; 426 sleepq_enter(sq, l, lock); 427 LOCKDEBUG_BARRIER(lock, 1); 428 l->l_kpriority = true; 429 obase = l->l_kpribase; 430 if (obase < PRI_KTHREAD) 431 l->l_kpribase = PRI_KTHREAD; 432 sleepq_enqueue(sq, obj, "tstile", sobj); 433 434 /* 435 * Disable preemption across this entire block, as we may drop 436 * scheduler locks (allowing preemption), and would prefer not 437 * to be interrupted while in a state of flux. 438 */ 439 KPREEMPT_DISABLE(l); 440 KASSERT(lock == l->l_mutex); 441 turnstile_lendpri(l); 442 sleepq_block(0, false); 443 l->l_kpribase = obase; 444 KPREEMPT_ENABLE(l); 445 } 446 447 /* 448 * turnstile_wakeup: 449 * 450 * Wake up the specified number of threads that are blocked 451 * in a turnstile. 452 */ 453 void 454 turnstile_wakeup(turnstile_t *ts, int q, int count, lwp_t *nl) 455 { 456 sleepq_t *sq; 457 kmutex_t *lock; 458 u_int hash; 459 lwp_t *l; 460 461 hash = TS_HASH(ts->ts_obj); 462 lock = &turnstile_locks[hash].lock; 463 sq = &ts->ts_sleepq[q]; 464 465 KASSERT(q == TS_READER_Q || q == TS_WRITER_Q); 466 KASSERT(count > 0 && count <= TS_WAITERS(ts, q)); 467 KASSERT(mutex_owned(lock)); 468 KASSERT(ts->ts_inheritor == curlwp || ts->ts_inheritor == NULL); 469 470 /* 471 * restore inherited priority if necessary. 472 */ 473 474 if (ts->ts_inheritor != NULL) { 475 turnstile_unlendpri(ts); 476 } 477 478 if (nl != NULL) { 479 #if defined(DEBUG) || defined(LOCKDEBUG) 480 LIST_FOREACH(l, sq, l_sleepchain) { 481 if (l == nl) 482 break; 483 } 484 if (l == NULL) 485 panic("turnstile_wakeup: nl not on sleepq"); 486 #endif 487 turnstile_remove(ts, nl, q); 488 } else { 489 while (count-- > 0) { 490 l = LIST_FIRST(sq); 491 KASSERT(l != NULL); 492 turnstile_remove(ts, l, q); 493 } 494 } 495 mutex_spin_exit(lock); 496 } 497 498 /* 499 * turnstile_unsleep: 500 * 501 * Remove an LWP from the turnstile. This is called when the LWP has 502 * not been awoken normally but instead interrupted: for example, if it 503 * has received a signal. It's not a valid action for turnstiles, 504 * since LWPs blocking on a turnstile are not interruptable. 505 */ 506 void 507 turnstile_unsleep(lwp_t *l, bool cleanup) 508 { 509 510 lwp_unlock(l); 511 panic("turnstile_unsleep"); 512 } 513 514 /* 515 * turnstile_changepri: 516 * 517 * Adjust the priority of an LWP residing on a turnstile. 518 */ 519 void 520 turnstile_changepri(lwp_t *l, pri_t pri) 521 { 522 523 /* XXX priority inheritance */ 524 sleepq_changepri(l, pri); 525 } 526 527 #if defined(LOCKDEBUG) 528 /* 529 * turnstile_print: 530 * 531 * Given the address of a lock object, print the contents of a 532 * turnstile. 533 */ 534 void 535 turnstile_print(volatile void *obj, void (*pr)(const char *, ...)) 536 { 537 turnstile_t *ts; 538 tschain_t *tc; 539 sleepq_t *rsq, *wsq; 540 u_int hash; 541 lwp_t *l; 542 543 hash = TS_HASH(obj); 544 tc = &turnstile_chains[hash]; 545 546 LIST_FOREACH(ts, tc, ts_chain) 547 if (ts->ts_obj == obj) 548 break; 549 550 if (ts == NULL) { 551 (*pr)("Turnstile: no active turnstile for this lock.\n"); 552 return; 553 } 554 555 rsq = &ts->ts_sleepq[TS_READER_Q]; 556 wsq = &ts->ts_sleepq[TS_WRITER_Q]; 557 558 (*pr)("Turnstile:\n"); 559 (*pr)("=> %d waiting readers:", TS_WAITERS(ts, TS_READER_Q)); 560 LIST_FOREACH(l, rsq, l_sleepchain) { 561 (*pr)(" %p", l); 562 } 563 (*pr)("\n"); 564 565 (*pr)("=> %d waiting writers:", TS_WAITERS(ts, TS_WRITER_Q)); 566 LIST_FOREACH(l, wsq, l_sleepchain) { 567 (*pr)(" %p", l); 568 } 569 (*pr)("\n"); 570 } 571 #endif /* LOCKDEBUG */ 572