/*	$NetBSD: kern_condvar.c,v 1.41 2018/01/30 07:52:22 ozaki-r Exp $	*/

/*-
 * Copyright (c) 2006, 2007, 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Kernel condition variable implementation.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_condvar.c,v 1.41 2018/01/30 07:52:22 ozaki-r Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lwp.h>
#include <sys/condvar.h>
#include <sys/sleepq.h>
#include <sys/lockdebug.h>
#include <sys/cpu.h>
#include <sys/kernel.h>

/*
 * Accessors for the private contents of the kcondvar_t data type.
 *
 *	cv_opaque[0]	sleepq...
 *	cv_opaque[1]	...pointers
 *	cv_opaque[2]	description for ps(1)
 *
 * cv_opaque[0..1] is protected by the interlock passed to cv_wait() (enqueue
 * only), and the sleep queue lock acquired with sleepq_hashlock() (enqueue
 * and dequeue).
 *
 * cv_opaque[2] (the wmesg) is static and does not change throughout the life
 * of the CV.
 */
#define	CV_SLEEPQ(cv)		((sleepq_t *)(cv)->cv_opaque)
#define	CV_WMESG(cv)		((const char *)(cv)->cv_opaque[2])
#define	CV_SET_WMESG(cv, v)	(cv)->cv_opaque[2] = __UNCONST(v)

#define	CV_DEBUG_P(cv)	(CV_WMESG(cv) != nodebug)
#define	CV_RA		((uintptr_t)__builtin_return_address(0))

static void		cv_unsleep(lwp_t *, bool);
static inline void	cv_wakeup_one(kcondvar_t *);
static inline void	cv_wakeup_all(kcondvar_t *);

static syncobj_t cv_syncobj = {
	.sobj_flag	= SOBJ_SLEEPQ_SORTED,
	.sobj_unsleep	= cv_unsleep,
	.sobj_changepri	= sleepq_changepri,
	.sobj_lendpri	= sleepq_lendpri,
	.sobj_owner	= syncobj_noowner,
};

lockops_t cv_lockops = {
	.lo_name = "Condition variable",
	.lo_type = LOCKOPS_CV,
	.lo_dump = NULL,
};

static const char deadcv[] = "deadcv";
#ifdef LOCKDEBUG
static const char nodebug[] = "nodebug";

#define	CV_LOCKDEBUG_HANDOFF(l, cv)	cv_lockdebug_handoff(l, cv)
#define	CV_LOCKDEBUG_PROCESS(l, cv)	cv_lockdebug_process(l, cv)

static inline void
cv_lockdebug_handoff(lwp_t *l, kcondvar_t *cv)
{

	if (CV_DEBUG_P(cv))
		l->l_flag |= LW_CVLOCKDEBUG;
}

static inline void
cv_lockdebug_process(lwp_t *l, kcondvar_t *cv)
{

	if ((l->l_flag & LW_CVLOCKDEBUG) == 0)
		return;

	l->l_flag &= ~LW_CVLOCKDEBUG;
	LOCKDEBUG_UNLOCKED(true, cv, CV_RA, 0);
}
#else
#define	CV_LOCKDEBUG_HANDOFF(l, cv)	__nothing
#define	CV_LOCKDEBUG_PROCESS(l, cv)	__nothing
#endif

/*
 * cv_init:
 *
 *	Initialize a condition variable for use.
 */
void
cv_init(kcondvar_t *cv, const char *wmesg)
{
#ifdef LOCKDEBUG
	bool dodebug;

	dodebug = LOCKDEBUG_ALLOC(cv, &cv_lockops,
	    (uintptr_t)__builtin_return_address(0));
	if (!dodebug) {
		/* XXX This will break vfs_lockf. */
		wmesg = nodebug;
	}
#endif
	KASSERT(wmesg != NULL);
	CV_SET_WMESG(cv, wmesg);
	sleepq_init(CV_SLEEPQ(cv));
}

/*
 * cv_destroy:
 *
 *	Tear down a condition variable.
 */
void
cv_destroy(kcondvar_t *cv)
{

	LOCKDEBUG_FREE(CV_DEBUG_P(cv), cv);
#ifdef DIAGNOSTIC
	KASSERT(cv_is_valid(cv));
	CV_SET_WMESG(cv, deadcv);
#endif
}

/*
 * cv_enter:
 *
 *	Look up and lock the sleep queue corresponding to the given
 *	condition variable, and increment the number of waiters.
 */
static inline void
cv_enter(kcondvar_t *cv, kmutex_t *mtx, lwp_t *l)
{
	sleepq_t *sq;
	kmutex_t *mp;

	KASSERT(cv_is_valid(cv));
	KASSERT(!cpu_intr_p());
	KASSERT((l->l_pflag & LP_INTR) == 0 || panicstr != NULL);

	LOCKDEBUG_LOCKED(CV_DEBUG_P(cv), cv, mtx, CV_RA, 0);

	l->l_kpriority = true;
	mp = sleepq_hashlock(cv);
	sq = CV_SLEEPQ(cv);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, cv, CV_WMESG(cv), &cv_syncobj);
	mutex_exit(mtx);
	KASSERT(cv_has_waiters(cv));
}

/*
 * cv_exit:
 *
 *	After resuming execution, check to see if we have been restarted
 *	as a result of cv_signal().  If we have, but cannot take the
 *	wakeup (because of e.g. a pending Unix signal or timeout) then try
 *	to ensure that another LWP sees it.  This is necessary because
 *	there may be multiple waiters, and at least one should take the
 *	wakeup if possible.
 */
static inline int
cv_exit(kcondvar_t *cv, kmutex_t *mtx, lwp_t *l, const int error)
{

	mutex_enter(mtx);
	if (__predict_false(error != 0))
		cv_signal(cv);

	LOCKDEBUG_UNLOCKED(CV_DEBUG_P(cv), cv, CV_RA, 0);
	KASSERT(cv_is_valid(cv));

	return error;
}

/*
 * cv_unsleep:
 *
 *	Remove an LWP from the condition variable and sleep queue.  This
 *	is called when the LWP has not been awoken normally but instead
 *	interrupted: for example, when a signal is received.  Must be
 *	called with the LWP locked, and must return it unlocked.
 */
static void
cv_unsleep(lwp_t *l, bool cleanup)
{
	kcondvar_t *cv __diagused;

	cv = (kcondvar_t *)(uintptr_t)l->l_wchan;

	KASSERT(l->l_wchan == (wchan_t)cv);
	KASSERT(l->l_sleepq == CV_SLEEPQ(cv));
	KASSERT(cv_is_valid(cv));
	KASSERT(cv_has_waiters(cv));

	sleepq_unsleep(l, cleanup);
}

/*
 * cv_wait:
 *
 *	Wait non-interruptibly on a condition variable until awoken.
 */
void
cv_wait(kcondvar_t *cv, kmutex_t *mtx)
{
	lwp_t *l = curlwp;

	KASSERT(mutex_owned(mtx));

	cv_enter(cv, mtx, l);

	/*
	 * We can't use cv_exit() here since the cv might be destroyed before
	 * this thread gets a chance to run.  Instead, hand off the lockdebug
	 * responsibility to the thread that wakes us up.
	 */

	CV_LOCKDEBUG_HANDOFF(l, cv);
	(void)sleepq_block(0, false);
	mutex_enter(mtx);
}
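/*
 * Typical caller pattern (illustrative sketch only, not part of this
 * file; `sc', `sc_lock', `sc_cv' and `sc_ready' are hypothetical).
 * Waiters must re-check their condition in a loop, since cv_wait()
 * may return on a wakeup intended for another consumer:
 *
 *	mutex_enter(&sc->sc_lock);
 *	while (!sc->sc_ready)
 *		cv_wait(&sc->sc_cv, &sc->sc_lock);
 *	mutex_exit(&sc->sc_lock);
 */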
/*
 * cv_wait_sig:
 *
 *	Wait on a condition variable until awoken or a signal is received.
 *	Will also return early if the process is exiting.  Returns zero if
 *	awoken normally, ERESTART if a signal was received and the system
 *	call is restartable, or EINTR otherwise.
 */
int
cv_wait_sig(kcondvar_t *cv, kmutex_t *mtx)
{
	lwp_t *l = curlwp;
	int error;

	KASSERT(mutex_owned(mtx));

	cv_enter(cv, mtx, l);
	error = sleepq_block(0, true);
	return cv_exit(cv, mtx, l, error);
}

/*
 * cv_timedwait:
 *
 *	Wait on a condition variable until awoken or the specified timeout
 *	expires.  Returns zero if awoken normally or EWOULDBLOCK if the
 *	timeout expired.
 *
 *	timo is a timeout in ticks.  timo = 0 specifies an infinite timeout.
 */
int
cv_timedwait(kcondvar_t *cv, kmutex_t *mtx, int timo)
{
	lwp_t *l = curlwp;
	int error;

	KASSERT(mutex_owned(mtx));

	cv_enter(cv, mtx, l);
	error = sleepq_block(timo, false);
	return cv_exit(cv, mtx, l, error);
}
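/*
 * Illustrative sketch (hypothetical names): bounding the wait with a
 * tick-based timeout.  mstohz() converts milliseconds to ticks, and
 * EWOULDBLOCK on return simply means the timeout expired first:
 *
 *	int error = 0;
 *
 *	mutex_enter(&sc->sc_lock);
 *	while (!sc->sc_ready && error != EWOULDBLOCK)
 *		error = cv_timedwait(&sc->sc_cv, &sc->sc_lock, mstohz(100));
 *	mutex_exit(&sc->sc_lock);
 */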
/*
 * cv_timedwait_sig:
 *
 *	Wait on a condition variable until the timeout expires, awoken,
 *	or a signal is received.  Will also return early if the process
 *	is exiting.  Returns zero if awoken normally, EWOULDBLOCK if the
 *	timeout expires, ERESTART if a signal was received and the system
 *	call is restartable, or EINTR otherwise.
 *
 *	timo is a timeout in ticks.  timo = 0 specifies an infinite timeout.
 */
int
cv_timedwait_sig(kcondvar_t *cv, kmutex_t *mtx, int timo)
{
	lwp_t *l = curlwp;
	int error;

	KASSERT(mutex_owned(mtx));

	cv_enter(cv, mtx, l);
	error = sleepq_block(timo, true);
	return cv_exit(cv, mtx, l, error);
}

/*
 * Given a number of seconds, sec, and 2^64ths of a second, frac, we
 * want a number of ticks for a timeout:
 *
 *	timo = hz*(sec + frac/2^64)
 *	     = hz*sec + hz*frac/2^64
 *	     = hz*sec + hz*(frachi*2^32 + fraclo)/2^64
 *	     = hz*sec + hz*frachi/2^32 + hz*fraclo/2^64,
 *
 * where frachi is the high 32 bits of frac and fraclo is the
 * low 32 bits.
 *
 * We assume hz < INT_MAX/2 < UINT32_MAX, so
 *
 *	hz*fraclo/2^64 < fraclo*2^32/2^64 <= 1,
 *
 * since fraclo < 2^32, and the last term can therefore be dropped.
 *
 * We clamp the result at INT_MAX/2 for a timeout in ticks, since we
 * can't represent timeouts higher than INT_MAX in cv_timedwait, and
 * spurious wakeup is OK.  Moreover, we don't want to wrap around,
 * because we compute end - start in ticks in order to compute the
 * remaining timeout, and that difference cannot wrap around, so we use
 * a timeout less than INT_MAX.  Using INT_MAX/2 provides plenty of
 * margin for paranoia and will exceed most waits in practice by far.
 */
static unsigned
bintime2timo(const struct bintime *bt)
{

	KASSERT(hz < INT_MAX/2);
	CTASSERT(INT_MAX/2 < UINT32_MAX);
	if (bt->sec > ((INT_MAX/2)/hz))
		return INT_MAX/2;
	if ((hz*(bt->frac >> 32) >> 32) > (INT_MAX/2 - hz*bt->sec))
		return INT_MAX/2;

	return hz*bt->sec + (hz*(bt->frac >> 32) >> 32);
}
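/*
 * Worked example (illustrative): with hz = 100 and a timeout of 1.5 s,
 * i.e. bt->sec = 1 and bt->frac = 2^63 (so frachi = 2^31),
 *
 *	timo = 100*1 + (100*2^31 >> 32) = 100 + 50 = 150 ticks,
 *
 * which is 1.5 seconds at 100 ticks/sec, as expected.
 */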
/*
 * timo is in units of ticks.  We want units of seconds and 2^64ths of
 * a second.  There are hz ticks in 1 sec and 2^64 (2^64ths of a
 * second) in 1 sec, from which we can conclude that one tick is
 * 2^64/hz 2^64ths of a second.  So for the fractional part, we compute
 *
 *	frac = rem * 2^64 / hz
 *	     = ((rem * 2^32) / hz) * 2^32
 *
 * Using truncating integer division instead of real division will
 * leave us with only about 32 bits of precision, which means about
 * 1/4-nanosecond resolution, which is good enough for our purposes.
 */
static struct bintime
timo2bintime(unsigned timo)
{

	return (struct bintime) {
		.sec = timo / hz,
		.frac = (((uint64_t)(timo % hz) << 32)/hz << 32),
	};
}

/*
 * cv_timedwaitbt:
 *
 *	Wait on a condition variable until awoken or the specified
 *	timeout expires.  Returns zero if awoken normally or
 *	EWOULDBLOCK if the timeout expires.
 *
 *	On entry, bt is a timeout in bintime.  cv_timedwaitbt subtracts
 *	the time slept, so on exit, bt is the time remaining after
 *	sleeping, possibly negative if the complete time has elapsed.
 *	No infinite timeout; use cv_wait instead.
 *
 *	epsilon is a requested maximum error in timeout (excluding
 *	spurious wakeups).  Currently not used, will be used in the
 *	future to choose between low- and high-resolution timers.
 *	Actual wakeup time will be somewhere in [t, t + max(e, r) + s)
 *	where r is the finest resolution of clock available and s is
 *	scheduling delays for scheduler overhead and competing threads.
 *	Time is measured by the interrupt source implementing the
 *	timeout, not by another timecounter.
 */
int
cv_timedwaitbt(kcondvar_t *cv, kmutex_t *mtx, struct bintime *bt,
    const struct bintime *epsilon __diagused)
{
	struct bintime slept;
	unsigned start, end;
	int error;

	KASSERTMSG(bt->sec >= 0, "negative timeout");
	KASSERTMSG(epsilon != NULL, "specify maximum requested delay");

	/*
	 * hardclock_ticks is technically int, but nothing special
	 * happens on overflow, so we assume two's-complement
	 * wraparound and just treat it as unsigned.
	 */
	start = hardclock_ticks;
	error = cv_timedwait(cv, mtx, bintime2timo(bt));
	end = hardclock_ticks;

	slept = timo2bintime(end - start);
	/* bt := bt - slept */
	bintime_sub(bt, &slept);

	return error;
}

/*
 * cv_timedwaitbt_sig:
 *
 *	Wait on a condition variable until awoken, the specified
 *	timeout expires, or interrupted by a signal.  Returns zero if
 *	awoken normally, EWOULDBLOCK if the timeout expires, or
 *	EINTR/ERESTART if interrupted by a signal.
 *
 *	On entry, bt is a timeout in bintime.  cv_timedwaitbt_sig
 *	subtracts the time slept, so on exit, bt is the time remaining
 *	after sleeping.  No infinite timeout; use cv_wait_sig instead.
 *
 *	epsilon is a requested maximum error in timeout (excluding
 *	spurious wakeups).  Currently not used, will be used in the
 *	future to choose between low- and high-resolution timers.
 */
int
cv_timedwaitbt_sig(kcondvar_t *cv, kmutex_t *mtx, struct bintime *bt,
    const struct bintime *epsilon __diagused)
{
	struct bintime slept;
	unsigned start, end;
	int error;

	KASSERTMSG(bt->sec >= 0, "negative timeout");
	KASSERTMSG(epsilon != NULL, "specify maximum requested delay");

	/*
	 * hardclock_ticks is technically int, but nothing special
	 * happens on overflow, so we assume two's-complement
	 * wraparound and just treat it as unsigned.
	 */
	start = hardclock_ticks;
	error = cv_timedwait_sig(cv, mtx, bintime2timo(bt));
	end = hardclock_ticks;

	slept = timo2bintime(end - start);
	/* bt := bt - slept */
	bintime_sub(bt, &slept);

	return error;
}
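/*
 * Illustrative sketch (hypothetical names): because cv_timedwaitbt()
 * subtracts the time slept from *bt, the caller can pass the same
 * bintime through a re-check loop without recomputing a deadline.
 * The epsilon here is an arbitrary ~1/16 s chosen for the example:
 *
 *	struct bintime bt = { .sec = 1, .frac = 0 };
 *	const struct bintime eps = { .sec = 0, .frac = 1ULL << 60 };
 *	int error = 0;
 *
 *	mutex_enter(&sc->sc_lock);
 *	while (!sc->sc_ready && error != EWOULDBLOCK)
 *		error = cv_timedwaitbt(&sc->sc_cv, &sc->sc_lock, &bt, &eps);
 *	mutex_exit(&sc->sc_lock);
 */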
/*
 * cv_signal:
 *
 *	Wake the highest priority LWP waiting on a condition variable.
 *	Must be called with the interlocking mutex held.
 */
void
cv_signal(kcondvar_t *cv)
{

	/* LOCKDEBUG_WAKEUP(CV_DEBUG_P(cv), cv, CV_RA); */
	KASSERT(cv_is_valid(cv));

	if (__predict_false(!TAILQ_EMPTY(CV_SLEEPQ(cv))))
		cv_wakeup_one(cv);
}

static inline void
cv_wakeup_one(kcondvar_t *cv)
{
	sleepq_t *sq;
	kmutex_t *mp;
	lwp_t *l;

	KASSERT(cv_is_valid(cv));

	mp = sleepq_hashlock(cv);
	sq = CV_SLEEPQ(cv);
	l = TAILQ_FIRST(sq);
	if (l == NULL) {
		mutex_spin_exit(mp);
		return;
	}
	KASSERT(l->l_sleepq == sq);
	KASSERT(l->l_mutex == mp);
	KASSERT(l->l_wchan == cv);
	CV_LOCKDEBUG_PROCESS(l, cv);
	sleepq_remove(sq, l);
	mutex_spin_exit(mp);

	KASSERT(cv_is_valid(cv));
}

/*
 * cv_broadcast:
 *
 *	Wake all LWPs waiting on a condition variable.  Must be called
 *	with the interlocking mutex held.
 */
void
cv_broadcast(kcondvar_t *cv)
{

	/* LOCKDEBUG_WAKEUP(CV_DEBUG_P(cv), cv, CV_RA); */
	KASSERT(cv_is_valid(cv));

	if (__predict_false(!TAILQ_EMPTY(CV_SLEEPQ(cv))))
		cv_wakeup_all(cv);
}

static inline void
cv_wakeup_all(kcondvar_t *cv)
{
	sleepq_t *sq;
	kmutex_t *mp;
	lwp_t *l, *next;

	KASSERT(cv_is_valid(cv));

	mp = sleepq_hashlock(cv);
	sq = CV_SLEEPQ(cv);
	for (l = TAILQ_FIRST(sq); l != NULL; l = next) {
		KASSERT(l->l_sleepq == sq);
		KASSERT(l->l_mutex == mp);
		KASSERT(l->l_wchan == cv);
		next = TAILQ_NEXT(l, l_sleepchain);
		CV_LOCKDEBUG_PROCESS(l, cv);
		sleepq_remove(sq, l);
	}
	mutex_spin_exit(mp);

	KASSERT(cv_is_valid(cv));
}

/*
 * cv_has_waiters:
 *
 *	For diagnostic assertions: return non-zero if a condition
 *	variable has waiters.
 */
bool
cv_has_waiters(kcondvar_t *cv)
{

	return !TAILQ_EMPTY(CV_SLEEPQ(cv));
}

/*
 * cv_is_valid:
 *
 *	For diagnostic assertions: return non-zero if a condition
 *	variable appears to be valid.  No locks need be held.
 */
bool
cv_is_valid(kcondvar_t *cv)
{

	return CV_WMESG(cv) != deadcv && CV_WMESG(cv) != NULL;
}
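/*
 * Illustrative sketch (hypothetical names) of the wakeup side, which
 * must hold the same interlock as the waiters.  cv_signal() suffices
 * when any single waiter can consume the event; cv_broadcast() is
 * needed when every waiter must re-evaluate its condition:
 *
 *	mutex_enter(&sc->sc_lock);
 *	sc->sc_ready = true;
 *	cv_broadcast(&sc->sc_cv);
 *	mutex_exit(&sc->sc_lock);
 */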