1 /* $NetBSD: kern_turnstile.c,v 1.30 2011/07/27 14:35:34 uebayasi Exp $ */ 2 3 /*- 4 * Copyright (c) 2002, 2006, 2007, 2009 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Jason R. Thorpe and Andrew Doran. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * Turnstiles are described in detail in: 34 * 35 * Solaris Internals: Core Kernel Architecture, Jim Mauro and 36 * Richard McDougall. 37 * 38 * Turnstiles are kept in a hash table. There are likely to be many more 39 * synchronisation objects than there are threads. Since a thread can block 40 * on only one lock at a time, we only need one turnstile per thread, and 41 * so they are allocated at thread creation time. 42 * 43 * When a thread decides it needs to block on a lock, it looks up the 44 * active turnstile for that lock. If no active turnstile exists, then 45 * the process lends its turnstile to the lock. If there is already an 46 * active turnstile for the lock, the thread places its turnstile on a 47 * list of free turnstiles, and references the active one instead. 48 * 49 * The act of looking up the turnstile acquires an interlock on the sleep 50 * queue. If a thread decides it doesn't need to block after all, then this 51 * interlock must be released by explicitly aborting the turnstile 52 * operation. 53 * 54 * When a thread is awakened, it needs to get its turnstile back. If there 55 * are still other threads waiting in the active turnstile, the thread 56 * grabs a free turnstile off the free list. Otherwise, it can take back 57 * the active turnstile from the lock (thus deactivating the turnstile). 58 * 59 * Turnstiles are the place to do priority inheritence. 60 */ 61 62 #include <sys/cdefs.h> 63 __KERNEL_RCSID(0, "$NetBSD: kern_turnstile.c,v 1.30 2011/07/27 14:35:34 uebayasi Exp $"); 64 65 #include <sys/param.h> 66 #include <sys/lockdebug.h> 67 #include <sys/pool.h> 68 #include <sys/proc.h> 69 #include <sys/sleepq.h> 70 #include <sys/systm.h> 71 72 #define TS_HASH_SIZE 64 73 #define TS_HASH_MASK (TS_HASH_SIZE - 1) 74 #define TS_HASH(obj) (((uintptr_t)(obj) >> 3) & TS_HASH_MASK) 75 76 static tschain_t turnstile_tab[TS_HASH_SIZE] __cacheline_aligned; 77 pool_cache_t turnstile_cache __read_mostly; 78 79 static int turnstile_ctor(void *, void *, int); 80 81 extern turnstile_t turnstile0; 82 83 /* 84 * turnstile_init: 85 * 86 * Initialize the turnstile mechanism. 87 */ 88 void 89 turnstile_init(void) 90 { 91 tschain_t *tc; 92 int i; 93 94 for (i = 0; i < TS_HASH_SIZE; i++) { 95 tc = &turnstile_tab[i]; 96 LIST_INIT(&tc->tc_chain); 97 tc->tc_mutex = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED); 98 } 99 100 turnstile_cache = pool_cache_init(sizeof(turnstile_t), 0, 0, 0, 101 "tstilepl", NULL, IPL_NONE, turnstile_ctor, NULL, NULL); 102 KASSERT(turnstile_cache != NULL); 103 104 (void)turnstile_ctor(NULL, &turnstile0, 0); 105 } 106 107 /* 108 * turnstile_ctor: 109 * 110 * Constructor for turnstiles. 111 */ 112 static int 113 turnstile_ctor(void *arg, void *obj, int flags) 114 { 115 turnstile_t *ts = obj; 116 117 memset(ts, 0, sizeof(*ts)); 118 sleepq_init(&ts->ts_sleepq[TS_READER_Q]); 119 sleepq_init(&ts->ts_sleepq[TS_WRITER_Q]); 120 return (0); 121 } 122 123 /* 124 * turnstile_remove: 125 * 126 * Remove an LWP from a turnstile sleep queue and wake it. 127 */ 128 static inline void 129 turnstile_remove(turnstile_t *ts, lwp_t *l, int q) 130 { 131 turnstile_t *nts; 132 133 KASSERT(l->l_ts == ts); 134 135 /* 136 * This process is no longer using the active turnstile. 137 * Find an inactive one on the free list to give to it. 138 */ 139 if ((nts = ts->ts_free) != NULL) { 140 KASSERT(TS_ALL_WAITERS(ts) > 1); 141 l->l_ts = nts; 142 ts->ts_free = nts->ts_free; 143 nts->ts_free = NULL; 144 } else { 145 /* 146 * If the free list is empty, this is the last 147 * waiter. 148 */ 149 KASSERT(TS_ALL_WAITERS(ts) == 1); 150 LIST_REMOVE(ts, ts_chain); 151 } 152 153 ts->ts_waiters[q]--; 154 sleepq_remove(&ts->ts_sleepq[q], l); 155 } 156 157 /* 158 * turnstile_lookup: 159 * 160 * Look up the turnstile for the specified lock. This acquires and 161 * holds the turnstile chain lock (sleep queue interlock). 162 */ 163 turnstile_t * 164 turnstile_lookup(wchan_t obj) 165 { 166 turnstile_t *ts; 167 tschain_t *tc; 168 169 tc = &turnstile_tab[TS_HASH(obj)]; 170 mutex_spin_enter(tc->tc_mutex); 171 172 LIST_FOREACH(ts, &tc->tc_chain, ts_chain) 173 if (ts->ts_obj == obj) 174 return (ts); 175 176 /* 177 * No turnstile yet for this lock. No problem, turnstile_block() 178 * handles this by fetching the turnstile from the blocking thread. 179 */ 180 return (NULL); 181 } 182 183 /* 184 * turnstile_exit: 185 * 186 * Abort a turnstile operation. 187 */ 188 void 189 turnstile_exit(wchan_t obj) 190 { 191 tschain_t *tc; 192 193 tc = &turnstile_tab[TS_HASH(obj)]; 194 mutex_spin_exit(tc->tc_mutex); 195 } 196 197 /* 198 * turnstile_block: 199 * 200 * Enter an object into the turnstile chain and prepare the current 201 * LWP for sleep. 202 */ 203 void 204 turnstile_block(turnstile_t *ts, int q, wchan_t obj, syncobj_t *sobj) 205 { 206 lwp_t *l; 207 lwp_t *cur; /* cached curlwp */ 208 lwp_t *owner; 209 turnstile_t *ots; 210 tschain_t *tc; 211 sleepq_t *sq; 212 pri_t prio, obase; 213 214 tc = &turnstile_tab[TS_HASH(obj)]; 215 l = cur = curlwp; 216 217 KASSERT(q == TS_READER_Q || q == TS_WRITER_Q); 218 KASSERT(mutex_owned(tc->tc_mutex)); 219 KASSERT(l != NULL && l->l_ts != NULL); 220 221 if (ts == NULL) { 222 /* 223 * We are the first thread to wait for this object; 224 * lend our turnstile to it. 225 */ 226 ts = l->l_ts; 227 KASSERT(TS_ALL_WAITERS(ts) == 0); 228 KASSERT(TAILQ_EMPTY(&ts->ts_sleepq[TS_READER_Q]) && 229 TAILQ_EMPTY(&ts->ts_sleepq[TS_WRITER_Q])); 230 ts->ts_obj = obj; 231 ts->ts_inheritor = NULL; 232 LIST_INSERT_HEAD(&tc->tc_chain, ts, ts_chain); 233 } else { 234 /* 235 * Object already has a turnstile. Put our turnstile 236 * onto the free list, and reference the existing 237 * turnstile instead. 238 */ 239 ots = l->l_ts; 240 KASSERT(ots->ts_free == NULL); 241 ots->ts_free = ts->ts_free; 242 ts->ts_free = ots; 243 l->l_ts = ts; 244 245 KASSERT(ts->ts_obj == obj); 246 KASSERT(TS_ALL_WAITERS(ts) != 0); 247 KASSERT(!TAILQ_EMPTY(&ts->ts_sleepq[TS_READER_Q]) || 248 !TAILQ_EMPTY(&ts->ts_sleepq[TS_WRITER_Q])); 249 } 250 251 sq = &ts->ts_sleepq[q]; 252 ts->ts_waiters[q]++; 253 sleepq_enter(sq, l, tc->tc_mutex); 254 LOCKDEBUG_BARRIER(tc->tc_mutex, 1); 255 l->l_kpriority = true; 256 obase = l->l_kpribase; 257 if (obase < PRI_KTHREAD) 258 l->l_kpribase = PRI_KTHREAD; 259 sleepq_enqueue(sq, obj, "tstile", sobj); 260 261 /* 262 * Disable preemption across this entire block, as we may drop 263 * scheduler locks (allowing preemption), and would prefer not 264 * to be interrupted while in a state of flux. 265 */ 266 KPREEMPT_DISABLE(l); 267 268 /* 269 * Lend our priority to lwps on the blocking chain. 270 * 271 * NOTE: if you get a panic in this code block, it is likely that 272 * a lock has been destroyed or corrupted while still in use. Try 273 * compiling a kernel with LOCKDEBUG to pinpoint the problem. 274 */ 275 prio = lwp_eprio(l); 276 KASSERT(cur == l); 277 KASSERT(tc->tc_mutex == cur->l_mutex); 278 for (;;) { 279 bool dolock; 280 281 if (l->l_wchan == NULL) 282 break; 283 284 owner = (*l->l_syncobj->sobj_owner)(l->l_wchan); 285 if (owner == NULL) 286 break; 287 288 /* The owner may have changed as we have dropped the tc lock */ 289 if (cur == owner) { 290 /* 291 * we own the lock: stop here, sleepq_block() 292 * should wake up immediatly 293 */ 294 break; 295 } 296 if (l->l_mutex != owner->l_mutex) 297 dolock = true; 298 else 299 dolock = false; 300 if (l == owner || (dolock && !lwp_trylock(owner))) { 301 /* 302 * restart from curlwp. 303 * Note that there may be a livelock here: 304 * the owner may try grabing cur's lock (which is 305 * the tc lock) while we're trying to grab 306 * the owner's lock. 307 */ 308 lwp_unlock(l); 309 l = cur; 310 lwp_lock(l); 311 prio = lwp_eprio(l); 312 continue; 313 } 314 if (prio <= lwp_eprio(owner)) { 315 if (dolock) 316 lwp_unlock(owner); 317 break; 318 } 319 ts = l->l_ts; 320 KASSERT(ts->ts_inheritor == owner || ts->ts_inheritor == NULL); 321 if (ts->ts_inheritor == NULL) { 322 ts->ts_inheritor = owner; 323 ts->ts_eprio = prio; 324 SLIST_INSERT_HEAD(&owner->l_pi_lenders, ts, ts_pichain); 325 lwp_lendpri(owner, prio); 326 } else if (prio > ts->ts_eprio) { 327 ts->ts_eprio = prio; 328 lwp_lendpri(owner, prio); 329 } 330 if (dolock) 331 lwp_unlock(l); 332 l = owner; 333 } 334 LOCKDEBUG_BARRIER(l->l_mutex, 1); 335 if (cur->l_mutex != l->l_mutex) { 336 lwp_unlock(l); 337 lwp_lock(cur); 338 } 339 LOCKDEBUG_BARRIER(cur->l_mutex, 1); 340 341 sleepq_block(0, false); 342 cur->l_kpribase = obase; 343 KPREEMPT_ENABLE(cur); 344 } 345 346 /* 347 * turnstile_wakeup: 348 * 349 * Wake up the specified number of threads that are blocked 350 * in a turnstile. 351 */ 352 void 353 turnstile_wakeup(turnstile_t *ts, int q, int count, lwp_t *nl) 354 { 355 sleepq_t *sq; 356 tschain_t *tc; 357 lwp_t *l; 358 359 tc = &turnstile_tab[TS_HASH(ts->ts_obj)]; 360 sq = &ts->ts_sleepq[q]; 361 362 KASSERT(q == TS_READER_Q || q == TS_WRITER_Q); 363 KASSERT(count > 0 && count <= TS_WAITERS(ts, q)); 364 KASSERT(mutex_owned(tc->tc_mutex)); 365 KASSERT(ts->ts_inheritor == curlwp || ts->ts_inheritor == NULL); 366 367 /* 368 * restore inherited priority if necessary. 369 */ 370 371 if (ts->ts_inheritor != NULL) { 372 turnstile_t *iter; 373 turnstile_t *next; 374 turnstile_t *prev = NULL; 375 pri_t prio; 376 bool dolock; 377 378 ts->ts_inheritor = NULL; 379 l = curlwp; 380 381 dolock = l->l_mutex == l->l_cpu->ci_schedstate.spc_lwplock; 382 if (dolock) { 383 lwp_lock(l); 384 } 385 386 /* 387 * the following loop does two things. 388 * 389 * - remove ts from the list. 390 * 391 * - from the rest of the list, find the highest priority. 392 */ 393 394 prio = -1; 395 KASSERT(!SLIST_EMPTY(&l->l_pi_lenders)); 396 for (iter = SLIST_FIRST(&l->l_pi_lenders); 397 iter != NULL; iter = next) { 398 KASSERT(lwp_eprio(l) >= ts->ts_eprio); 399 next = SLIST_NEXT(iter, ts_pichain); 400 if (iter == ts) { 401 if (prev == NULL) { 402 SLIST_REMOVE_HEAD(&l->l_pi_lenders, 403 ts_pichain); 404 } else { 405 SLIST_REMOVE_AFTER(prev, ts_pichain); 406 } 407 } else if (prio < iter->ts_eprio) { 408 prio = iter->ts_eprio; 409 } 410 prev = iter; 411 } 412 413 lwp_lendpri(l, prio); 414 415 if (dolock) { 416 lwp_unlock(l); 417 } 418 } 419 420 if (nl != NULL) { 421 #if defined(DEBUG) || defined(LOCKDEBUG) 422 TAILQ_FOREACH(l, sq, l_sleepchain) { 423 if (l == nl) 424 break; 425 } 426 if (l == NULL) 427 panic("turnstile_wakeup: nl not on sleepq"); 428 #endif 429 turnstile_remove(ts, nl, q); 430 } else { 431 while (count-- > 0) { 432 l = TAILQ_FIRST(sq); 433 KASSERT(l != NULL); 434 turnstile_remove(ts, l, q); 435 } 436 } 437 mutex_spin_exit(tc->tc_mutex); 438 } 439 440 /* 441 * turnstile_unsleep: 442 * 443 * Remove an LWP from the turnstile. This is called when the LWP has 444 * not been awoken normally but instead interrupted: for example, if it 445 * has received a signal. It's not a valid action for turnstiles, 446 * since LWPs blocking on a turnstile are not interruptable. 447 */ 448 void 449 turnstile_unsleep(lwp_t *l, bool cleanup) 450 { 451 452 lwp_unlock(l); 453 panic("turnstile_unsleep"); 454 } 455 456 /* 457 * turnstile_changepri: 458 * 459 * Adjust the priority of an LWP residing on a turnstile. 460 */ 461 void 462 turnstile_changepri(lwp_t *l, pri_t pri) 463 { 464 465 /* XXX priority inheritance */ 466 sleepq_changepri(l, pri); 467 } 468 469 #if defined(LOCKDEBUG) 470 /* 471 * turnstile_print: 472 * 473 * Given the address of a lock object, print the contents of a 474 * turnstile. 475 */ 476 void 477 turnstile_print(volatile void *obj, void (*pr)(const char *, ...)) 478 { 479 turnstile_t *ts; 480 tschain_t *tc; 481 sleepq_t *rsq, *wsq; 482 lwp_t *l; 483 484 tc = &turnstile_tab[TS_HASH(obj)]; 485 486 LIST_FOREACH(ts, &tc->tc_chain, ts_chain) 487 if (ts->ts_obj == obj) 488 break; 489 490 (*pr)("Turnstile chain at %p.\n", tc); 491 if (ts == NULL) { 492 (*pr)("=> No active turnstile for this lock.\n"); 493 return; 494 } 495 496 rsq = &ts->ts_sleepq[TS_READER_Q]; 497 wsq = &ts->ts_sleepq[TS_WRITER_Q]; 498 499 (*pr)("=> Turnstile at %p (wrq=%p, rdq=%p).\n", ts, rsq, wsq); 500 501 (*pr)("=> %d waiting readers:", TS_WAITERS(ts, TS_READER_Q)); 502 TAILQ_FOREACH(l, rsq, l_sleepchain) { 503 (*pr)(" %p", l); 504 } 505 (*pr)("\n"); 506 507 (*pr)("=> %d waiting writers:", TS_WAITERS(ts, TS_WRITER_Q)); 508 TAILQ_FOREACH(l, wsq, l_sleepchain) { 509 (*pr)(" %p", l); 510 } 511 (*pr)("\n"); 512 } 513 #endif /* LOCKDEBUG */ 514