/*	$NetBSD: kern_turnstile.c,v 1.31 2011/12/02 12:31:53 yamt Exp $	*/

/*-
 * Copyright (c) 2002, 2006, 2007, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe and Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Turnstiles are described in detail in:
 *
 *	Solaris Internals: Core Kernel Architecture, Jim Mauro and
 *	    Richard McDougall.
 *
 * Turnstiles are kept in a hash table.  There are likely to be many more
 * synchronisation objects than there are threads.  Since a thread can block
 * on only one lock at a time, we only need one turnstile per thread, and
 * so they are allocated at thread creation time.
 *
 * When a thread decides it needs to block on a lock, it looks up the
 * active turnstile for that lock.  If no active turnstile exists, then
 * the thread lends its turnstile to the lock.  If there is already an
 * active turnstile for the lock, the thread places its turnstile on a
 * list of free turnstiles, and references the active one instead.
 *
 * The act of looking up the turnstile acquires an interlock on the sleep
 * queue.  If a thread decides it doesn't need to block after all, then this
 * interlock must be released by explicitly aborting the turnstile
 * operation.
 *
 * When a thread is awakened, it needs to get its turnstile back.  If there
 * are still other threads waiting in the active turnstile, the thread
 * grabs a free turnstile off the free list.  Otherwise, it can take back
 * the active turnstile from the lock (thus deactivating the turnstile).
 *
 * Turnstiles are the place to do priority inheritance.
 */
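
/*
 * Sketch of typical use, modelled on how a sleep lock such as
 * kern_mutex.c drives this interface (illustrative only; try_acquire(),
 * release() and lock_syncobj are placeholders, not part of this file):
 *
 *	// Acquisition slow path.
 *	turnstile_t *ts = turnstile_lookup(lock);   // takes the chain lock
 *	if (try_acquire(lock))
 *		turnstile_exit(lock);               // abort: drop chain lock
 *	else
 *		// ts may be NULL here; turnstile_block() copes with that.
 *		turnstile_block(ts, TS_WRITER_Q, lock, &lock_syncobj);
 *	// ... re-check ownership and retry as needed.
 *
 *	// Release slow path.
 *	ts = turnstile_lookup(lock);
 *	release(lock);
 *	if (ts == NULL)
 *		turnstile_exit(lock);               // nobody was waiting
 *	else
 *		turnstile_wakeup(ts, TS_WRITER_Q,   // drops the chain lock
 *		    TS_WAITERS(ts, TS_WRITER_Q), NULL);
 */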

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_turnstile.c,v 1.31 2011/12/02 12:31:53 yamt Exp $");

#include <sys/param.h>
#include <sys/lockdebug.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/sleepq.h>
#include <sys/systm.h>

#define	TS_HASH_SIZE	64
#define	TS_HASH_MASK	(TS_HASH_SIZE - 1)
#define	TS_HASH(obj)	(((uintptr_t)(obj) >> 3) & TS_HASH_MASK)

static tschain_t	turnstile_tab[TS_HASH_SIZE]	__cacheline_aligned;
pool_cache_t		turnstile_cache			__read_mostly;

static int	turnstile_ctor(void *, void *, int);

extern turnstile_t turnstile0;

/*
 * turnstile_init:
 *
 *	Initialize the turnstile mechanism.
 */
void
turnstile_init(void)
{
	tschain_t *tc;
	int i;

	for (i = 0; i < TS_HASH_SIZE; i++) {
		tc = &turnstile_tab[i];
		LIST_INIT(&tc->tc_chain);
		tc->tc_mutex = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
	}

	turnstile_cache = pool_cache_init(sizeof(turnstile_t), 0, 0, 0,
	    "tstilepl", NULL, IPL_NONE, turnstile_ctor, NULL, NULL);
	KASSERT(turnstile_cache != NULL);

	(void)turnstile_ctor(NULL, &turnstile0, 0);
}

/*
 * turnstile_ctor:
 *
 *	Constructor for turnstiles.
 */
static int
turnstile_ctor(void *arg, void *obj, int flags)
{
	turnstile_t *ts = obj;

	memset(ts, 0, sizeof(*ts));
	sleepq_init(&ts->ts_sleepq[TS_READER_Q]);
	sleepq_init(&ts->ts_sleepq[TS_WRITER_Q]);
	return (0);
}

/*
 * turnstile_remove:
 *
 *	Remove an LWP from a turnstile sleep queue and wake it.
 */
static inline void
turnstile_remove(turnstile_t *ts, lwp_t *l, int q)
{
	turnstile_t *nts;

	KASSERT(l->l_ts == ts);

	/*
	 * This thread is no longer using the active turnstile.
	 * Find an inactive one on the free list to give to it.
	 */
	if ((nts = ts->ts_free) != NULL) {
		KASSERT(TS_ALL_WAITERS(ts) > 1);
		l->l_ts = nts;
		ts->ts_free = nts->ts_free;
		nts->ts_free = NULL;
	} else {
		/*
		 * If the free list is empty, this is the last
		 * waiter.
		 */
		KASSERT(TS_ALL_WAITERS(ts) == 1);
		LIST_REMOVE(ts, ts_chain);
	}

	ts->ts_waiters[q]--;
	sleepq_remove(&ts->ts_sleepq[q], l);
}

/*
 * turnstile_lookup:
 *
 *	Look up the turnstile for the specified lock.  This acquires and
 *	holds the turnstile chain lock (sleep queue interlock).
 */
turnstile_t *
turnstile_lookup(wchan_t obj)
{
	turnstile_t *ts;
	tschain_t *tc;

	tc = &turnstile_tab[TS_HASH(obj)];
	mutex_spin_enter(tc->tc_mutex);

	LIST_FOREACH(ts, &tc->tc_chain, ts_chain)
		if (ts->ts_obj == obj)
			return (ts);

	/*
	 * No turnstile yet for this lock.  No problem, turnstile_block()
	 * handles this by fetching the turnstile from the blocking thread.
	 */
	return (NULL);
}

/*
 * turnstile_exit:
 *
 *	Abort a turnstile operation.
 */
void
turnstile_exit(wchan_t obj)
{
	tschain_t *tc;

	tc = &turnstile_tab[TS_HASH(obj)];
	mutex_spin_exit(tc->tc_mutex);
}
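
/*
 * Worked example of priority lending (LWPs and priorities hypothetical,
 * for illustration only): LWP A (eprio 90) blocks on mutex M1 owned by
 * LWP B (eprio 50), and B is itself blocked on rwlock R1 owned by LWP C
 * (eprio 20).  turnstile_lendpri(A) walks the blocking chain A -> B -> C,
 * recording A's priority in each owner's turnstile (ts_eprio) and calling
 * lwp_lendpri(), so both B and C run at eprio 90 until they release
 * their locks and turnstile_unlendpri() recomputes what remains on
 * their l_pi_lenders lists.
 */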

/*
 * turnstile_lendpri:
 *
 *	Lend our priority to LWPs on the blocking chain.
 */
static void
turnstile_lendpri(lwp_t *cur)
{
	lwp_t *l = cur;
	pri_t prio;

	/*
	 * NOTE: if you get a panic in this code block, it is likely that
	 * a lock has been destroyed or corrupted while still in use.  Try
	 * compiling a kernel with LOCKDEBUG to pinpoint the problem.
	 */

	LOCKDEBUG_BARRIER(l->l_mutex, 1);
	KASSERT(l == curlwp);
	prio = lwp_eprio(l);
	for (;;) {
		lwp_t *owner;
		turnstile_t *ts;
		bool dolock;

		if (l->l_wchan == NULL)
			break;

		owner = (*l->l_syncobj->sobj_owner)(l->l_wchan);
		if (owner == NULL)
			break;

		/* The owner may have changed as we have dropped the tc lock. */
		if (cur == owner) {
			/*
			 * We own the lock: stop here, sleepq_block()
			 * should wake up immediately.
			 */
			break;
		}
		if (l->l_mutex != owner->l_mutex)
			dolock = true;
		else
			dolock = false;
		if (l == owner || (dolock && !lwp_trylock(owner))) {
			/*
			 * Restart from curlwp.
			 * Note that there may be a livelock here:
			 * the owner may try grabbing cur's lock (which is
			 * the tc lock) while we're trying to grab
			 * the owner's lock.
			 */
			lwp_unlock(l);
			l = cur;
			lwp_lock(l);
			prio = lwp_eprio(l);
			continue;
		}
		if (prio <= lwp_eprio(owner)) {
			if (dolock)
				lwp_unlock(owner);
			break;
		}
		ts = l->l_ts;
		KASSERT(ts->ts_inheritor == owner || ts->ts_inheritor == NULL);
		if (ts->ts_inheritor == NULL) {
			ts->ts_inheritor = owner;
			ts->ts_eprio = prio;
			SLIST_INSERT_HEAD(&owner->l_pi_lenders, ts, ts_pichain);
			lwp_lendpri(owner, prio);
		} else if (prio > ts->ts_eprio) {
			ts->ts_eprio = prio;
			lwp_lendpri(owner, prio);
		}
		if (dolock)
			lwp_unlock(l);
		l = owner;
	}
	LOCKDEBUG_BARRIER(l->l_mutex, 1);
	if (cur->l_mutex != l->l_mutex) {
		lwp_unlock(l);
		lwp_lock(cur);
	}
	LOCKDEBUG_BARRIER(cur->l_mutex, 1);
}

/*
 * turnstile_unlendpri:
 *
 *	Undo turnstile_lendpri().
 */
static void
turnstile_unlendpri(turnstile_t *ts)
{
	lwp_t * const l = curlwp;
	turnstile_t *iter;
	turnstile_t *next;
	turnstile_t *prev = NULL;
	pri_t prio;
	bool dolock;

	KASSERT(ts->ts_inheritor != NULL);
	ts->ts_inheritor = NULL;
	dolock = l->l_mutex == l->l_cpu->ci_schedstate.spc_lwplock;
	if (dolock) {
		lwp_lock(l);
	}

	/*
	 * The following loop does two things:
	 *
	 * - remove ts from the list;
	 *
	 * - from the rest of the list, find the highest priority.
	 */

	prio = -1;
	KASSERT(!SLIST_EMPTY(&l->l_pi_lenders));
	for (iter = SLIST_FIRST(&l->l_pi_lenders);
	    iter != NULL; iter = next) {
		KASSERT(lwp_eprio(l) >= ts->ts_eprio);
		next = SLIST_NEXT(iter, ts_pichain);
		if (iter == ts) {
			if (prev == NULL) {
				SLIST_REMOVE_HEAD(&l->l_pi_lenders,
				    ts_pichain);
			} else {
				SLIST_REMOVE_AFTER(prev, ts_pichain);
			}
		} else if (prio < iter->ts_eprio) {
			prio = iter->ts_eprio;
		}
		prev = iter;
	}

	lwp_lendpri(l, prio);

	if (dolock) {
		lwp_unlock(l);
	}
}
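
/*
 * Example of the recomputation above (values hypothetical): if
 * l_pi_lenders holds { ts1 (eprio 80), ts2 (eprio 70) } and ts1 is the
 * turnstile being released, the loop unlinks ts1, finds 70 as the
 * highest remaining lent priority, and lwp_lendpri() lowers the LWP's
 * lent priority accordingly (its own base priority still applies).
 */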

/*
 * turnstile_block:
 *
 *	Enter an object into the turnstile chain and prepare the current
 *	LWP for sleep.
 */
void
turnstile_block(turnstile_t *ts, int q, wchan_t obj, syncobj_t *sobj)
{
	lwp_t * const l = curlwp; /* cached curlwp */
	turnstile_t *ots;
	tschain_t *tc;
	sleepq_t *sq;
	pri_t obase;

	tc = &turnstile_tab[TS_HASH(obj)];

	KASSERT(q == TS_READER_Q || q == TS_WRITER_Q);
	KASSERT(mutex_owned(tc->tc_mutex));
	KASSERT(l != NULL && l->l_ts != NULL);

	if (ts == NULL) {
		/*
		 * We are the first thread to wait for this object;
		 * lend our turnstile to it.
		 */
		ts = l->l_ts;
		KASSERT(TS_ALL_WAITERS(ts) == 0);
		KASSERT(TAILQ_EMPTY(&ts->ts_sleepq[TS_READER_Q]) &&
		    TAILQ_EMPTY(&ts->ts_sleepq[TS_WRITER_Q]));
		ts->ts_obj = obj;
		ts->ts_inheritor = NULL;
		LIST_INSERT_HEAD(&tc->tc_chain, ts, ts_chain);
	} else {
		/*
		 * Object already has a turnstile.  Put our turnstile
		 * onto the free list, and reference the existing
		 * turnstile instead.
		 */
		ots = l->l_ts;
		KASSERT(ots->ts_free == NULL);
		ots->ts_free = ts->ts_free;
		ts->ts_free = ots;
		l->l_ts = ts;

		KASSERT(ts->ts_obj == obj);
		KASSERT(TS_ALL_WAITERS(ts) != 0);
		KASSERT(!TAILQ_EMPTY(&ts->ts_sleepq[TS_READER_Q]) ||
		    !TAILQ_EMPTY(&ts->ts_sleepq[TS_WRITER_Q]));
	}

	sq = &ts->ts_sleepq[q];
	ts->ts_waiters[q]++;
	sleepq_enter(sq, l, tc->tc_mutex);
	LOCKDEBUG_BARRIER(tc->tc_mutex, 1);
	l->l_kpriority = true;
	obase = l->l_kpribase;
	if (obase < PRI_KTHREAD)
		l->l_kpribase = PRI_KTHREAD;
	sleepq_enqueue(sq, obj, "tstile", sobj);

	/*
	 * Disable preemption across this entire block, as we may drop
	 * scheduler locks (allowing preemption), and would prefer not
	 * to be interrupted while in a state of flux.
	 */
	KPREEMPT_DISABLE(l);
	KASSERT(tc->tc_mutex == l->l_mutex);
	turnstile_lendpri(l);
	sleepq_block(0, false);
	l->l_kpribase = obase;
	KPREEMPT_ENABLE(l);
}

/*
 * turnstile_wakeup:
 *
 *	Wake up the specified number of threads that are blocked
 *	in a turnstile.
 */
void
turnstile_wakeup(turnstile_t *ts, int q, int count, lwp_t *nl)
{
	sleepq_t *sq;
	tschain_t *tc;
	lwp_t *l;

	tc = &turnstile_tab[TS_HASH(ts->ts_obj)];
	sq = &ts->ts_sleepq[q];

	KASSERT(q == TS_READER_Q || q == TS_WRITER_Q);
	KASSERT(count > 0 && count <= TS_WAITERS(ts, q));
	KASSERT(mutex_owned(tc->tc_mutex));
	KASSERT(ts->ts_inheritor == curlwp || ts->ts_inheritor == NULL);

	/*
	 * Restore inherited priority if necessary.
	 */

	if (ts->ts_inheritor != NULL) {
		turnstile_unlendpri(ts);
	}

	if (nl != NULL) {
#if defined(DEBUG) || defined(LOCKDEBUG)
		TAILQ_FOREACH(l, sq, l_sleepchain) {
			if (l == nl)
				break;
		}
		if (l == NULL)
			panic("turnstile_wakeup: nl not on sleepq");
#endif
		turnstile_remove(ts, nl, q);
	} else {
		while (count-- > 0) {
			l = TAILQ_FIRST(sq);
			KASSERT(l != NULL);
			turnstile_remove(ts, l, q);
		}
	}
	mutex_spin_exit(tc->tc_mutex);
}

/*
 * turnstile_unsleep:
 *
 *	Remove an LWP from the turnstile.  This is called when the LWP has
 *	not been awoken normally but instead interrupted: for example, if it
 *	has received a signal.  It's not a valid action for turnstiles,
 *	since LWPs blocking on a turnstile are not interruptible.
 */
void
turnstile_unsleep(lwp_t *l, bool cleanup)
{

	lwp_unlock(l);
	panic("turnstile_unsleep");
}

/*
 * turnstile_changepri:
 *
 *	Adjust the priority of an LWP residing on a turnstile.
 */
void
turnstile_changepri(lwp_t *l, pri_t pri)
{

	/* XXX priority inheritance */
	sleepq_changepri(l, pri);
}
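
/*
 * turnstile_unsleep() and turnstile_changepri() are not called directly:
 * a lock exports them through its syncobj_t, and the sleep queue code
 * invokes them via l_syncobj.  As a sketch, a mutex's sync object would
 * look like kern_mutex.c's (field order per sys/syncobj.h of this
 * vintage):
 *
 *	syncobj_t mutex_syncobj = {
 *		SOBJ_SLEEPQ_SORTED,
 *		turnstile_unsleep,
 *		turnstile_changepri,
 *		sleepq_lendpri,
 *		mutex_owner,
 *	};
 *
 * The sobj_owner hook (mutex_owner here) is what turnstile_lendpri()
 * uses to find the next LWP on the blocking chain.
 */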

#if defined(LOCKDEBUG)
/*
 * turnstile_print:
 *
 *	Given the address of a lock object, print the contents of a
 *	turnstile.
 */
void
turnstile_print(volatile void *obj, void (*pr)(const char *, ...))
{
	turnstile_t *ts;
	tschain_t *tc;
	sleepq_t *rsq, *wsq;
	lwp_t *l;

	tc = &turnstile_tab[TS_HASH(obj)];

	LIST_FOREACH(ts, &tc->tc_chain, ts_chain)
		if (ts->ts_obj == obj)
			break;

	(*pr)("Turnstile chain at %p.\n", tc);
	if (ts == NULL) {
		(*pr)("=> No active turnstile for this lock.\n");
		return;
	}

	rsq = &ts->ts_sleepq[TS_READER_Q];
	wsq = &ts->ts_sleepq[TS_WRITER_Q];

	/* Arguments ordered to match the wrq/rdq labels. */
	(*pr)("=> Turnstile at %p (wrq=%p, rdq=%p).\n", ts, wsq, rsq);

	(*pr)("=> %d waiting readers:", TS_WAITERS(ts, TS_READER_Q));
	TAILQ_FOREACH(l, rsq, l_sleepchain) {
		(*pr)(" %p", l);
	}
	(*pr)("\n");

	(*pr)("=> %d waiting writers:", TS_WAITERS(ts, TS_WRITER_Q));
	TAILQ_FOREACH(l, wsq, l_sleepchain) {
		(*pr)(" %p", l);
	}
	(*pr)("\n");
}
#endif	/* LOCKDEBUG */
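
/*
 * Example output (pointer values hypothetical), as produced by the
 * format strings above:
 *
 *	Turnstile chain at 0xffffffff80b8e3c0.
 *	=> Turnstile at 0xffff0000d5e2a040 (wrq=0xffff0000d5e2a070, rdq=0xffff0000d5e2a058).
 *	=> 1 waiting readers: 0xffff0000d5f00580
 *	=> 2 waiting writers: 0xffff0000d5f01200 0xffff0000d5f01e80
 */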