/*	$NetBSD: kern_turnstile.c,v 1.32 2012/06/15 13:51:40 yamt Exp $	*/

/*-
 * Copyright (c) 2002, 2006, 2007, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe and Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Turnstiles are described in detail in:
 *
 *	Solaris Internals: Core Kernel Architecture, Jim Mauro and
 *	    Richard McDougall.
 *
 * Turnstiles are kept in a hash table.  There are likely to be many more
 * synchronisation objects than there are threads.  Since a thread can block
 * on only one lock at a time, we only need one turnstile per thread, and
 * so they are allocated at thread creation time.
 *
 * When a thread decides it needs to block on a lock, it looks up the
 * active turnstile for that lock.  If no active turnstile exists, then
 * the process lends its turnstile to the lock.  If there is already an
 * active turnstile for the lock, the thread places its turnstile on a
 * list of free turnstiles, and references the active one instead.
 *
 * The act of looking up the turnstile acquires an interlock on the sleep
 * queue.  If a thread decides it doesn't need to block after all, then this
 * interlock must be released by explicitly aborting the turnstile
 * operation.
 *
 * When a thread is awakened, it needs to get its turnstile back.  If there
 * are still other threads waiting in the active turnstile, the thread
 * grabs a free turnstile off the free list.  Otherwise, it can take back
 * the active turnstile from the lock (thus deactivating the turnstile).
 *
 * Turnstiles are the place to do priority inheritance.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_turnstile.c,v 1.32 2012/06/15 13:51:40 yamt Exp $");

#include <sys/param.h>
#include <sys/lockdebug.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/sleepq.h>
#include <sys/systm.h>

/*
 * Hash on the lock object's address.  The low 3 bits are discarded since
 * lock objects are at least 8-byte aligned and would otherwise always
 * hash to even buckets.
 */
#define	TS_HASH_SIZE	64
#define	TS_HASH_MASK	(TS_HASH_SIZE - 1)
#define	TS_HASH(obj)	(((uintptr_t)(obj) >> 3) & TS_HASH_MASK)

/* Hash table of turnstile chains; each chain has its own spin mutex. */
static tschain_t	turnstile_tab[TS_HASH_SIZE]	__cacheline_aligned;
pool_cache_t		turnstile_cache			__read_mostly;

static int	turnstile_ctor(void *, void *, int);

/* Statically allocated turnstile for lwp0 (see turnstile_init()). */
extern turnstile_t turnstile0;

/*
 * turnstile_init:
 *
 *	Initialize the turnstile mechanism: set up each hash chain and its
 *	per-chain mutex, create the turnstile pool cache, and construct
 *	the statically-allocated turnstile0 (lwp0's turnstile, which is
 *	never obtained from the pool).
 */
void
turnstile_init(void)
{
	tschain_t *tc;
	int i;

	for (i = 0; i < TS_HASH_SIZE; i++) {
		tc = &turnstile_tab[i];
		LIST_INIT(&tc->tc_chain);
		/* IPL_SCHED: the chain lock is taken from scheduler context. */
		tc->tc_mutex = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
	}

	turnstile_cache = pool_cache_init(sizeof(turnstile_t), 0, 0, 0,
	    "tstilepl", NULL, IPL_NONE, turnstile_ctor, NULL, NULL);
	KASSERT(turnstile_cache != NULL);

	/* turnstile0 bypasses the pool, so run the constructor by hand. */
	(void)turnstile_ctor(NULL, &turnstile0, 0);
}

/*
 * turnstile_ctor:
 *
 *	Constructor for turnstiles: zero the object and initialize its
 *	reader and writer sleep queues.  Always succeeds (returns 0).
 */
static int
turnstile_ctor(void *arg, void *obj, int flags)
{
	turnstile_t *ts = obj;

	memset(ts, 0, sizeof(*ts));
	sleepq_init(&ts->ts_sleepq[TS_READER_Q]);
	sleepq_init(&ts->ts_sleepq[TS_WRITER_Q]);
	return (0);
}

/*
 * turnstile_remove:
 *
 *	Remove an LWP from a turnstile sleep queue and wake it.  Also
 *	hands the LWP a turnstile to take with it: either one from the
 *	active turnstile's free list, or - if it is the last waiter -
 *	the active turnstile itself, which is then unhashed.
 *
 *	Called with the turnstile chain lock held (asserted by the sole
 *	caller, turnstile_wakeup()).
 */
static inline void
turnstile_remove(turnstile_t *ts, lwp_t *l, int q)
{
	turnstile_t *nts;

	KASSERT(l->l_ts == ts);

	/*
	 * This process is no longer using the active turnstile.
	 * Find an inactive one on the free list to give to it.
	 */
	if ((nts = ts->ts_free) != NULL) {
		KASSERT(TS_ALL_WAITERS(ts) > 1);
		l->l_ts = nts;
		ts->ts_free = nts->ts_free;
		nts->ts_free = NULL;
	} else {
		/*
		 * If the free list is empty, this is the last
		 * waiter.  The active turnstile becomes inactive
		 * again and leaves the hash chain with this LWP.
		 */
		KASSERT(TS_ALL_WAITERS(ts) == 1);
		LIST_REMOVE(ts, ts_chain);
	}

	ts->ts_waiters[q]--;
	sleepq_remove(&ts->ts_sleepq[q], l);
}

/*
 * turnstile_lookup:
 *
 *	Look up the turnstile for the specified lock.  This acquires and
 *	holds the turnstile chain lock (sleep queue interlock).
 *
 *	Returns the active turnstile, or NULL if none exists yet for
 *	this lock.  In BOTH cases the chain lock is held on return; the
 *	caller must release it via turnstile_block(), turnstile_wakeup()
 *	or turnstile_exit().
 */
turnstile_t *
turnstile_lookup(wchan_t obj)
{
	turnstile_t *ts;
	tschain_t *tc;

	tc = &turnstile_tab[TS_HASH(obj)];
	mutex_spin_enter(tc->tc_mutex);

	LIST_FOREACH(ts, &tc->tc_chain, ts_chain)
		if (ts->ts_obj == obj)
			return (ts);

	/*
	 * No turnstile yet for this lock.  No problem, turnstile_block()
	 * handles this by fetching the turnstile from the blocking thread.
	 */
	return (NULL);
}

/*
 * turnstile_exit:
 *
 *	Abort a turnstile operation: drop the chain lock acquired by
 *	turnstile_lookup() without blocking.  Used when the caller
 *	decides it does not need to sleep after all.
 */
void
turnstile_exit(wchan_t obj)
{
	tschain_t *tc;

	tc = &turnstile_tab[TS_HASH(obj)];
	mutex_spin_exit(tc->tc_mutex);
}

/*
 * turnstile_lendpri:
 *
 *	Lend our priority to lwps on the blocking chain.
 *
 *	If the current owner of the lock (l->l_wchan, set by sleepq_enqueue)
 *	has a priority lower than ours (lwp_eprio(l)), lend our priority to
 *	him to avoid priority inversions.  The walk continues transitively
 *	down the chain of blocked owners until an owner that is running,
 *	unknown, or already at sufficient priority is found.
 *
 *	Called with cur's LWP lock (== the turnstile chain lock at this
 *	point) held; returns with cur's (possibly new) LWP lock held.
 */

static void
turnstile_lendpri(lwp_t *cur)
{
	lwp_t * l = cur;
	pri_t prio;

	/*
	 * NOTE: if you get a panic in this code block, it is likely that
	 * a lock has been destroyed or corrupted while still in use.  Try
	 * compiling a kernel with LOCKDEBUG to pinpoint the problem.
	 */

	LOCKDEBUG_BARRIER(l->l_mutex, 1);
	KASSERT(l == curlwp);
	prio = lwp_eprio(l);
	for (;;) {
		lwp_t *owner;
		turnstile_t *ts;
		bool dolock;

		/* Not blocked: end of the chain. */
		if (l->l_wchan == NULL)
			break;

		/*
		 * Ask syncobj the owner of the lock.
		 */
		owner = (*l->l_syncobj->sobj_owner)(l->l_wchan);
		if (owner == NULL)
			break;

		/*
		 * The owner may have changed as we have dropped the tc lock.
		 */
		if (cur == owner) {
			/*
			 * We own the lock: stop here, sleepq_block()
			 * should wake up immediately.
			 */
			break;
		}
		/*
		 * Acquire owner->l_mutex if we don't have it yet.
		 * Because we already have another LWP lock (l->l_mutex) held,
		 * we need to play a try lock dance to avoid deadlock.
		 */
		dolock = l->l_mutex != owner->l_mutex;
		if (l == owner || (dolock && !lwp_trylock(owner))) {
			/*
			 * The owner was changed behind us or trylock failed.
			 * Restart from curlwp.
			 *
			 * Note that there may be a livelock here:
			 * the owner may try grabbing cur's lock (which is the
			 * tc lock) while we're trying to grab the owner's lock.
			 */
			lwp_unlock(l);
			l = cur;
			lwp_lock(l);
			prio = lwp_eprio(l);
			continue;
		}
		/*
		 * If the owner's priority is already higher than ours,
		 * there's nothing to do anymore.
		 */
		if (prio <= lwp_eprio(owner)) {
			if (dolock)
				lwp_unlock(owner);
			break;
		}
		/*
		 * Lend our priority to the 'owner' LWP.
		 *
		 * Update lenders info for turnstile_unlendpri.
		 */
		ts = l->l_ts;
		KASSERT(ts->ts_inheritor == owner || ts->ts_inheritor == NULL);
		if (ts->ts_inheritor == NULL) {
			/* First lender on this turnstile. */
			ts->ts_inheritor = owner;
			ts->ts_eprio = prio;
			SLIST_INSERT_HEAD(&owner->l_pi_lenders, ts, ts_pichain);
			lwp_lendpri(owner, prio);
		} else if (prio > ts->ts_eprio) {
			/* Raise an existing loan. */
			ts->ts_eprio = prio;
			lwp_lendpri(owner, prio);
		}
		/* Hand-over-hand: keep owner locked, move down the chain. */
		if (dolock)
			lwp_unlock(l);
		LOCKDEBUG_BARRIER(owner->l_mutex, 1);
		l = owner;
	}
	LOCKDEBUG_BARRIER(l->l_mutex, 1);
	/* Re-acquire cur's lock if we ended up holding a different one. */
	if (cur->l_mutex != l->l_mutex) {
		lwp_unlock(l);
		lwp_lock(cur);
	}
	LOCKDEBUG_BARRIER(cur->l_mutex, 1);
}

/*
 * turnstile_unlendpri: undo turnstile_lendpri
 *
 *	Detach the turnstile from curlwp's list of priority lenders and
 *	recompute curlwp's lent priority as the maximum over the
 *	remaining lenders (-1, i.e. "no loan", if none remain).
 */

static void
turnstile_unlendpri(turnstile_t *ts)
{
	lwp_t * const l = curlwp;
	turnstile_t *iter;
	turnstile_t *next;
	turnstile_t *prev = NULL;
	pri_t prio;
	bool dolock;

	KASSERT(ts->ts_inheritor != NULL);
	ts->ts_inheritor = NULL;
	/*
	 * Lock l only if it is not already locked via the turnstile;
	 * NOTE(review): relies on spc_lwplock being the "not sleeping"
	 * LWP lock - confirm against the sleepq/lwp locking scheme.
	 */
	dolock = l->l_mutex == l->l_cpu->ci_schedstate.spc_lwplock;
	if (dolock) {
		lwp_lock(l);
	}

	/*
	 * the following loop does two things.
	 *
	 * - remove ts from the list.
	 *
	 * - from the rest of the list, find the highest priority.
	 */

	prio = -1;
	KASSERT(!SLIST_EMPTY(&l->l_pi_lenders));
	for (iter = SLIST_FIRST(&l->l_pi_lenders);
	    iter != NULL; iter = next) {
		KASSERT(lwp_eprio(l) >= ts->ts_eprio);
		next = SLIST_NEXT(iter, ts_pichain);
		if (iter == ts) {
			if (prev == NULL) {
				SLIST_REMOVE_HEAD(&l->l_pi_lenders,
				    ts_pichain);
			} else {
				SLIST_REMOVE_AFTER(prev, ts_pichain);
			}
		} else if (prio < iter->ts_eprio) {
			prio = iter->ts_eprio;
		}
		prev = iter;
	}

	lwp_lendpri(l, prio);

	if (dolock) {
		lwp_unlock(l);
	}
}

/*
 * turnstile_block:
 *
 *	Enter an object into the turnstile chain and prepare the current
 *	LWP for sleep.
 *
 *	'ts' is the turnstile returned by turnstile_lookup() (NULL if the
 *	lock had no active turnstile yet); the chain lock acquired there
 *	must still be held, and is given up to the sleepq machinery here.
 *	Performs priority lending before blocking.
 */
void
turnstile_block(turnstile_t *ts, int q, wchan_t obj, syncobj_t *sobj)
{
	lwp_t * const l = curlwp; /* cached curlwp */
	turnstile_t *ots;
	tschain_t *tc;
	sleepq_t *sq;
	pri_t obase;

	tc = &turnstile_tab[TS_HASH(obj)];

	KASSERT(q == TS_READER_Q || q == TS_WRITER_Q);
	KASSERT(mutex_owned(tc->tc_mutex));
	KASSERT(l != NULL && l->l_ts != NULL);

	if (ts == NULL) {
		/*
		 * We are the first thread to wait for this object;
		 * lend our turnstile to it.
		 */
		ts = l->l_ts;
		KASSERT(TS_ALL_WAITERS(ts) == 0);
		KASSERT(TAILQ_EMPTY(&ts->ts_sleepq[TS_READER_Q]) &&
			TAILQ_EMPTY(&ts->ts_sleepq[TS_WRITER_Q]));
		ts->ts_obj = obj;
		ts->ts_inheritor = NULL;
		LIST_INSERT_HEAD(&tc->tc_chain, ts, ts_chain);
	} else {
		/*
		 * Object already has a turnstile.  Put our turnstile
		 * onto the free list, and reference the existing
		 * turnstile instead.
		 */
		ots = l->l_ts;
		KASSERT(ots->ts_free == NULL);
		ots->ts_free = ts->ts_free;
		ts->ts_free = ots;
		l->l_ts = ts;

		KASSERT(ts->ts_obj == obj);
		KASSERT(TS_ALL_WAITERS(ts) != 0);
		KASSERT(!TAILQ_EMPTY(&ts->ts_sleepq[TS_READER_Q]) ||
			!TAILQ_EMPTY(&ts->ts_sleepq[TS_WRITER_Q]));
	}

	sq = &ts->ts_sleepq[q];
	ts->ts_waiters[q]++;
	/* The chain lock becomes this LWP's lock (sleep queue interlock). */
	sleepq_enter(sq, l, tc->tc_mutex);
	LOCKDEBUG_BARRIER(tc->tc_mutex, 1);
	/* Boost to kernel priority range while blocked on a turnstile. */
	l->l_kpriority = true;
	obase = l->l_kpribase;
	if (obase < PRI_KTHREAD)
		l->l_kpribase = PRI_KTHREAD;
	sleepq_enqueue(sq, obj, "tstile", sobj);

	/*
	 * Disable preemption across this entire block, as we may drop
	 * scheduler locks (allowing preemption), and would prefer not
	 * to be interrupted while in a state of flux.
	 */
	KPREEMPT_DISABLE(l);
	KASSERT(tc->tc_mutex == l->l_mutex);
	turnstile_lendpri(l);
	sleepq_block(0, false);
	l->l_kpribase = obase;
	KPREEMPT_ENABLE(l);
}

/*
 * turnstile_wakeup:
 *
 *	Wake up the specified number of threads that are blocked
 *	in a turnstile.  If 'nl' is non-NULL, wake that specific LWP
 *	(it must be on the queue); otherwise wake 'count' LWPs from
 *	the head of queue 'q'.
 *
 *	Any priority lent to the current LWP through this turnstile is
 *	returned first.  Called with the chain lock held; releases it
 *	before returning.
 */
void
turnstile_wakeup(turnstile_t *ts, int q, int count, lwp_t *nl)
{
	sleepq_t *sq;
	tschain_t *tc;
	lwp_t *l;

	tc = &turnstile_tab[TS_HASH(ts->ts_obj)];
	sq = &ts->ts_sleepq[q];

	KASSERT(q == TS_READER_Q || q == TS_WRITER_Q);
	KASSERT(count > 0 && count <= TS_WAITERS(ts, q));
	KASSERT(mutex_owned(tc->tc_mutex));
	KASSERT(ts->ts_inheritor == curlwp || ts->ts_inheritor == NULL);

	/*
	 * restore inherited priority if necessary.
	 */

	if (ts->ts_inheritor != NULL) {
		turnstile_unlendpri(ts);
	}

	if (nl != NULL) {
#if defined(DEBUG) || defined(LOCKDEBUG)
		/* Sanity: 'nl' must actually be on this sleep queue. */
		TAILQ_FOREACH(l, sq, l_sleepchain) {
			if (l == nl)
				break;
		}
		if (l == NULL)
			panic("turnstile_wakeup: nl not on sleepq");
#endif
		turnstile_remove(ts, nl, q);
	} else {
		while (count-- > 0) {
			l = TAILQ_FIRST(sq);
			KASSERT(l != NULL);
			turnstile_remove(ts, l, q);
		}
	}
	mutex_spin_exit(tc->tc_mutex);
}

/*
 * turnstile_unsleep:
 *
 *	Remove an LWP from the turnstile.  This is called when the LWP has
 *	not been awoken normally but instead interrupted: for example, if it
 *	has received a signal.  It's not a valid action for turnstiles,
 *	since LWPs blocking on a turnstile are not interruptible - so this
 *	always panics.
 */
void
turnstile_unsleep(lwp_t *l, bool cleanup)
{

	lwp_unlock(l);
	panic("turnstile_unsleep");
}

/*
 * turnstile_changepri:
 *
 *	Adjust the priority of an LWP residing on a turnstile.
 */
void
turnstile_changepri(lwp_t *l, pri_t pri)
{

	/* XXX priority inheritance */
	sleepq_changepri(l, pri);
}

#if defined(LOCKDEBUG)
/*
 * turnstile_print:
 *
 *	Given the address of a lock object, print the contents of a
 *	turnstile.  LOCKDEBUG/ddb helper; performs no locking, so it is
 *	only safe from debugger context.
 */
void
turnstile_print(volatile void *obj, void (*pr)(const char *, ...))
{
	turnstile_t *ts;
	tschain_t *tc;
	sleepq_t *rsq, *wsq;
	lwp_t *l;

	tc = &turnstile_tab[TS_HASH(obj)];

	LIST_FOREACH(ts, &tc->tc_chain, ts_chain)
		if (ts->ts_obj == obj)
			break;

	(*pr)("Turnstile chain at %p.\n", tc);
	if (ts == NULL) {
		(*pr)("=> No active turnstile for this lock.\n");
		return;
	}

	rsq = &ts->ts_sleepq[TS_READER_Q];
	wsq = &ts->ts_sleepq[TS_WRITER_Q];

	(*pr)("=> Turnstile at %p (wrq=%p, rdq=%p).\n", ts, rsq, wsq);

	(*pr)("=> %d waiting readers:", TS_WAITERS(ts, TS_READER_Q));
	TAILQ_FOREACH(l, rsq, l_sleepchain) {
		(*pr)(" %p", l);
	}
	(*pr)("\n");

	(*pr)("=> %d waiting writers:", TS_WAITERS(ts, TS_WRITER_Q));
	TAILQ_FOREACH(l, wsq, l_sleepchain) {
		(*pr)(" %p", l);
	}
	(*pr)("\n");
}
#endif	/* LOCKDEBUG */