/*	$NetBSD: kern_condvar.c,v 1.52 2020/05/11 03:59:33 riastradh Exp $	*/

/*-
 * Copyright (c) 2006, 2007, 2008, 2019, 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Kernel condition variable implementation.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_condvar.c,v 1.52 2020/05/11 03:59:33 riastradh Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lwp.h>
#include <sys/condvar.h>
#include <sys/sleepq.h>
#include <sys/lockdebug.h>
#include <sys/cpu.h>
#include <sys/kernel.h>

/*
 * Accessors for the private contents of the kcondvar_t data type.
 *
 *	cv_opaque[0]	sleepq_t
 *	cv_opaque[1]	description for ps(1)
 *
 * cv_opaque[0] is protected by the interlock passed to cv_wait() (enqueue
 * only), and the sleep queue lock acquired with sleepq_hashlock() (enqueue
 * and dequeue).
 *
 * cv_opaque[1] (the wmesg) is static and does not change throughout the life
 * of the CV.
 */
#define	CV_SLEEPQ(cv)		((sleepq_t *)(cv)->cv_opaque)
#define	CV_WMESG(cv)		((const char *)(cv)->cv_opaque[1])
#define	CV_SET_WMESG(cv, v)	(cv)->cv_opaque[1] = __UNCONST(v)

#define	CV_DEBUG_P(cv)	(CV_WMESG(cv) != nodebug)
#define	CV_RA		((uintptr_t)__builtin_return_address(0))

/*
 * Note: the definitions below mark the wakeup functions __noinline, so
 * the forward declarations must not say "inline".
 */
static void		cv_unsleep(lwp_t *, bool);
static __noinline void	cv_wakeup_one(kcondvar_t *);
static __noinline void	cv_wakeup_all(kcondvar_t *);

syncobj_t cv_syncobj = {
	.sobj_flag	= SOBJ_SLEEPQ_SORTED,
	.sobj_unsleep	= cv_unsleep,
	.sobj_changepri	= sleepq_changepri,
	.sobj_lendpri	= sleepq_lendpri,
	.sobj_owner	= syncobj_noowner,
};

static const char deadcv[] = "deadcv";

/*
 * cv_init:
 *
 *	Initialize a condition variable for use.
 */
void
cv_init(kcondvar_t *cv, const char *wmesg)
{

	KASSERT(wmesg != NULL);
	CV_SET_WMESG(cv, wmesg);
	sleepq_init(CV_SLEEPQ(cv));
}
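/*
 * Illustrative sketch, not part of the implementation: the canonical
 * condvar(9) pattern pairs a CV with a mutex and re-checks a predicate
 * in a loop, since wakeups may be spurious.  The "sc_lock", "sc_cv",
 * and "sc_ready" fields below are hypothetical.
 *
 *	mutex_enter(&sc->sc_lock);
 *	while (!sc->sc_ready)
 *		cv_wait(&sc->sc_cv, &sc->sc_lock);
 *	mutex_exit(&sc->sc_lock);
 *
 * The waker sets the predicate and wakes with the same mutex held:
 *
 *	mutex_enter(&sc->sc_lock);
 *	sc->sc_ready = true;
 *	cv_broadcast(&sc->sc_cv);
 *	mutex_exit(&sc->sc_lock);
 */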
/*
 * cv_destroy:
 *
 *	Tear down a condition variable.
 */
void
cv_destroy(kcondvar_t *cv)
{

#ifdef DIAGNOSTIC
	KASSERT(cv_is_valid(cv));
	KASSERT(!cv_has_waiters(cv));
	CV_SET_WMESG(cv, deadcv);
#endif
}

/*
 * cv_enter:
 *
 *	Look up and lock the sleep queue corresponding to the given
 *	condition variable, and increment the number of waiters.
 */
static inline void
cv_enter(kcondvar_t *cv, kmutex_t *mtx, lwp_t *l, bool catch_p)
{
	sleepq_t *sq;
	kmutex_t *mp;

	KASSERT(cv_is_valid(cv));
	KASSERT(!cpu_intr_p());
	KASSERT((l->l_pflag & LP_INTR) == 0 || panicstr != NULL);

	l->l_kpriority = true;
	mp = sleepq_hashlock(cv);
	sq = CV_SLEEPQ(cv);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, cv, CV_WMESG(cv), &cv_syncobj, catch_p);
	mutex_exit(mtx);
	KASSERT(cv_has_waiters(cv));
}

/*
 * cv_unsleep:
 *
 *	Remove an LWP from the condition variable and sleep queue.  This
 *	is called when the LWP has not been awoken normally but instead
 *	interrupted: for example, when a signal is received.  Must be
 *	called with the LWP locked.  Will unlock if "unlock" is true.
 */
static void
cv_unsleep(lwp_t *l, bool unlock)
{
	kcondvar_t *cv __diagused;

	cv = (kcondvar_t *)(uintptr_t)l->l_wchan;

	KASSERT(l->l_wchan == (wchan_t)cv);
	KASSERT(l->l_sleepq == CV_SLEEPQ(cv));
	KASSERT(cv_is_valid(cv));
	KASSERT(cv_has_waiters(cv));

	sleepq_unsleep(l, unlock);
}

/*
 * cv_wait:
 *
 *	Wait non-interruptibly on a condition variable until awoken.
 */
void
cv_wait(kcondvar_t *cv, kmutex_t *mtx)
{
	lwp_t *l = curlwp;

	KASSERT(mutex_owned(mtx));

	cv_enter(cv, mtx, l, false);
	(void)sleepq_block(0, false);
	mutex_enter(mtx);
}

/*
 * cv_wait_sig:
 *
 *	Wait on a condition variable until awoken or a signal is received.
 *	Will also return early if the process is exiting.  Returns zero if
 *	awoken normally, ERESTART if a signal was received and the system
 *	call is restartable, or EINTR otherwise.
 */
int
cv_wait_sig(kcondvar_t *cv, kmutex_t *mtx)
{
	lwp_t *l = curlwp;
	int error;

	KASSERT(mutex_owned(mtx));

	cv_enter(cv, mtx, l, true);
	error = sleepq_block(0, true);
	mutex_enter(mtx);
	return error;
}

/*
 * cv_timedwait:
 *
 *	Wait on a condition variable until awoken or the specified timeout
 *	expires.  Returns zero if awoken normally or EWOULDBLOCK if the
 *	timeout expired.
 *
 *	timo is a timeout in ticks.  timo = 0 specifies an infinite timeout.
 */
int
cv_timedwait(kcondvar_t *cv, kmutex_t *mtx, int timo)
{
	lwp_t *l = curlwp;
	int error;

	KASSERT(mutex_owned(mtx));

	cv_enter(cv, mtx, l, false);
	error = sleepq_block(timo, false);
	mutex_enter(mtx);
	return error;
}
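/*
 * Illustrative sketch (hypothetical "sc" fields as above): a bounded
 * wait of roughly 100ms using mstohz().  A timeout is distinguished
 * from success by re-checking the predicate, not by the error code
 * alone.  Note that each wakeup restarts the full timeout here; the
 * cv_timedwaitbt() variant below avoids that by tracking the time
 * remaining.
 *
 *	int error = 0;
 *
 *	mutex_enter(&sc->sc_lock);
 *	while (!sc->sc_ready && error == 0)
 *		error = cv_timedwait(&sc->sc_cv, &sc->sc_lock, mstohz(100));
 *	if (sc->sc_ready)
 *		error = 0;
 *	mutex_exit(&sc->sc_lock);
 */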
/*
 * cv_timedwait_sig:
 *
 *	Wait on a condition variable until awoken, the specified timeout
 *	expires, or a signal is received.  Will also return early if the
 *	process is exiting.  Returns zero if awoken normally, EWOULDBLOCK
 *	if the timeout expires, ERESTART if a signal was received and the
 *	system call is restartable, or EINTR otherwise.
 *
 *	timo is a timeout in ticks.  timo = 0 specifies an infinite timeout.
 */
int
cv_timedwait_sig(kcondvar_t *cv, kmutex_t *mtx, int timo)
{
	lwp_t *l = curlwp;
	int error;

	KASSERT(mutex_owned(mtx));

	cv_enter(cv, mtx, l, true);
	error = sleepq_block(timo, true);
	mutex_enter(mtx);
	return error;
}

/*
 * Given a number of seconds, sec, and 2^64ths of a second, frac, we
 * want a number of ticks for a timeout:
 *
 *	timo = hz*(sec + frac/2^64)
 *	     = hz*sec + hz*frac/2^64
 *	     = hz*sec + hz*(frachi*2^32 + fraclo)/2^64
 *	     = hz*sec + hz*frachi/2^32 + hz*fraclo/2^64,
 *
 * where frachi is the high 32 bits of frac and fraclo is the low 32
 * bits.
 *
 * We assume hz < INT_MAX/2 < UINT32_MAX, so
 *
 *	hz*fraclo/2^64 < fraclo*2^32/2^64 <= 1,
 *
 * since fraclo < 2^32.
 *
 * We clamp the result at INT_MAX/2 for a timeout in ticks, since we
 * can't represent timeouts higher than INT_MAX in cv_timedwait, and
 * spurious wakeup is OK.  Moreover, we don't want to wrap around,
 * because we compute end - start in ticks in order to compute the
 * remaining timeout, and that difference cannot wrap around, so we use
 * a timeout less than INT_MAX.  Using INT_MAX/2 provides plenty of
 * margin for paranoia and will exceed most waits in practice by far.
 */
static unsigned
bintime2timo(const struct bintime *bt)
{

	KASSERT(hz < INT_MAX/2);
	CTASSERT(INT_MAX/2 < UINT32_MAX);
	if (bt->sec > ((INT_MAX/2)/hz))
		return INT_MAX/2;
	if ((hz*(bt->frac >> 32) >> 32) > (INT_MAX/2 - hz*bt->sec))
		return INT_MAX/2;

	return hz*bt->sec + (hz*(bt->frac >> 32) >> 32);
}

/*
 * timo is in units of ticks.  We want units of seconds and 2^64ths of
 * a second.  We know hz ticks = 1 sec and 2^64 2^64ths of a second =
 * 1 sec, from which we can conclude that one tick = 2^64/hz 2^64ths
 * of a second.  So for the fractional part, we compute
 *
 *	frac = rem * 2^64 / hz
 *	     = ((rem * 2^32) / hz) * 2^32
 *
 * Using truncating integer division instead of real division will
 * leave us with only about 32 bits of precision, which means about
 * 1/4-nanosecond resolution, which is good enough for our purposes.
 */
static struct bintime
timo2bintime(unsigned timo)
{

	return (struct bintime) {
		.sec = timo / hz,
		.frac = (((uint64_t)(timo % hz) << 32)/hz << 32),
	};
}
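/*
 * Illustrative worked example of the two conversions above: with
 * hz = 100 and bt = { .sec = 1, .frac = 1ULL << 63 } (1.5 sec),
 *
 *	bintime2timo(&bt) = 100*1 + (100*(2^63 >> 32)) >> 32
 *	                  = 100 + (100*2^31)/2^32
 *	                  = 100 + 50 = 150 ticks,
 *
 * and converting back, timo2bintime(150) = { .sec = 150/100 = 1,
 * .frac = ((50 << 32)/100) << 32 = 2^63 }, i.e. 1.5 sec again.  The
 * round trip happens to be exact here; in general, truncation loses
 * up to about 1/2^32 of a second per conversion.
 */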
/*
 * cv_timedwaitbt:
 *
 *	Wait on a condition variable until awoken or the specified
 *	timeout expires.  Returns zero if awoken normally or
 *	EWOULDBLOCK if the timeout expires.
 *
 *	On entry, bt is a timeout in bintime.  cv_timedwaitbt subtracts
 *	the time slept, so on exit, bt is the time remaining after
 *	sleeping, clamped to zero once the complete time has elapsed.
 *	No infinite timeout; use cv_wait instead.
 *
 *	epsilon is a requested maximum error in timeout (excluding
 *	spurious wakeups).  Currently not used, will be used in the
 *	future to choose between low- and high-resolution timers.
 *	Actual wakeup time will be somewhere in [t, t + max(e, r) + s)
 *	where r is the finest resolution of clock available and s is
 *	scheduling delays for scheduler overhead and competing threads.
 *	Time is measured by the interrupt source implementing the
 *	timeout, not by another timecounter.
 */
int
cv_timedwaitbt(kcondvar_t *cv, kmutex_t *mtx, struct bintime *bt,
    const struct bintime *epsilon __diagused)
{
	struct bintime slept;
	unsigned start, end;
	int timo;
	int error;

	KASSERTMSG(bt->sec >= 0, "negative timeout");
	KASSERTMSG(epsilon != NULL, "specify maximum requested delay");

	/* If there's nothing left to wait for, time out.  */
	if (bt->sec == 0 && bt->frac == 0)
		return EWOULDBLOCK;

	/* Convert to ticks, but clamp to be >= 1.  */
	timo = bintime2timo(bt);
	KASSERTMSG(timo >= 0, "negative ticks: %d", timo);
	if (timo == 0)
		timo = 1;

	/*
	 * getticks() is technically int, but nothing special
	 * happens on overflow, so we assume two's-complement
	 * wraparound and just treat it as unsigned.
	 */
	start = getticks();
	error = cv_timedwait(cv, mtx, timo);
	end = getticks();

	/*
	 * Set it to the time left, or zero, whichever is larger.  We
	 * do not fail with EWOULDBLOCK here because this may have been
	 * an explicit wakeup, so the caller needs to check before they
	 * give up or else cv_signal would be lost.
	 */
	slept = timo2bintime(end - start);
	if (bintimecmp(bt, &slept, <=)) {
		bt->sec = 0;
		bt->frac = 0;
	} else {
		/* bt := bt - slept */
		bintime_sub(bt, &slept);
	}

	return error;
}
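/*
 * Illustrative sketch (hypothetical "sc" fields as above): wait at
 * most one second in total across any number of wakeups, letting
 * cv_timedwaitbt() charge each sleep against the budget remaining in
 * bt.  epsilon is currently unused by the implementation, so any
 * non-NULL value serves; 2^60/2^64 of a second (1/16 sec) here.
 *
 *	struct bintime bt = { .sec = 1, .frac = 0 };
 *	const struct bintime epsilon = { .sec = 0, .frac = 1ULL << 60 };
 *	int error = 0;
 *
 *	mutex_enter(&sc->sc_lock);
 *	while (!sc->sc_ready && error == 0)
 *		error = cv_timedwaitbt(&sc->sc_cv, &sc->sc_lock,
 *		    &bt, &epsilon);
 *	if (sc->sc_ready)
 *		error = 0;
 *	mutex_exit(&sc->sc_lock);
 */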
/*
 * cv_timedwaitbt_sig:
 *
 *	Wait on a condition variable until awoken, the specified
 *	timeout expires, or interrupted by a signal.  Returns zero if
 *	awoken normally, EWOULDBLOCK if the timeout expires, or
 *	EINTR/ERESTART if interrupted by a signal.
 *
 *	On entry, bt is a timeout in bintime.  cv_timedwaitbt_sig
 *	subtracts the time slept, so on exit, bt is the time remaining
 *	after sleeping.  No infinite timeout; use cv_wait_sig instead.
 *
 *	epsilon is a requested maximum error in timeout (excluding
 *	spurious wakeups).  Currently not used, will be used in the
 *	future to choose between low- and high-resolution timers.
 */
int
cv_timedwaitbt_sig(kcondvar_t *cv, kmutex_t *mtx, struct bintime *bt,
    const struct bintime *epsilon __diagused)
{
	struct bintime slept;
	unsigned start, end;
	int timo;
	int error;

	KASSERTMSG(bt->sec >= 0, "negative timeout");
	KASSERTMSG(epsilon != NULL, "specify maximum requested delay");

	/* If there's nothing left to wait for, time out.  */
	if (bt->sec == 0 && bt->frac == 0)
		return EWOULDBLOCK;

	/* Convert to ticks, but clamp to be >= 1.  */
	timo = bintime2timo(bt);
	KASSERTMSG(timo >= 0, "negative ticks: %d", timo);
	if (timo == 0)
		timo = 1;

	/*
	 * getticks() is technically int, but nothing special
	 * happens on overflow, so we assume two's-complement
	 * wraparound and just treat it as unsigned.
	 */
	start = getticks();
	error = cv_timedwait_sig(cv, mtx, timo);
	end = getticks();

	/*
	 * Set it to the time left, or zero, whichever is larger.  We
	 * do not fail with EWOULDBLOCK here because this may have been
	 * an explicit wakeup, so the caller needs to check before they
	 * give up or else cv_signal would be lost.
	 */
	slept = timo2bintime(end - start);
	if (bintimecmp(bt, &slept, <=)) {
		bt->sec = 0;
		bt->frac = 0;
	} else {
		/* bt := bt - slept */
		bintime_sub(bt, &slept);
	}

	return error;
}

/*
 * cv_signal:
 *
 *	Wake the highest priority LWP waiting on a condition variable.
 *	Must be called with the interlocking mutex held.
 */
void
cv_signal(kcondvar_t *cv)
{

	KASSERT(cv_is_valid(cv));

	if (__predict_false(!LIST_EMPTY(CV_SLEEPQ(cv))))
		cv_wakeup_one(cv);
}

/*
 * cv_wakeup_one:
 *
 *	Slow path for cv_signal().  Deliberately marked __noinline to
 *	prevent the compiler pulling it in to cv_signal(), which adds
 *	extra prologue and epilogue code.
 */
static __noinline void
cv_wakeup_one(kcondvar_t *cv)
{
	sleepq_t *sq;
	kmutex_t *mp;
	lwp_t *l;

	/*
	 * Keep waking LWPs until a non-interruptible waiter is found.  An
	 * interruptible waiter could fail to do something useful with the
	 * wakeup due to an error return from cv_[timed]wait_sig(), and the
	 * caller of cv_signal() may not expect such a scenario.
	 *
	 * This isn't a problem for non-interruptible waits (untimed and
	 * timed), because if such a waiter is woken here it will not return
	 * an error.
	 */
	mp = sleepq_hashlock(cv);
	sq = CV_SLEEPQ(cv);
	while ((l = LIST_FIRST(sq)) != NULL) {
		KASSERT(l->l_sleepq == sq);
		KASSERT(l->l_mutex == mp);
		KASSERT(l->l_wchan == cv);
		if ((l->l_flag & LW_SINTR) == 0) {
			sleepq_remove(sq, l);
			break;
		} else
			sleepq_remove(sq, l);
	}
	mutex_spin_exit(mp);
}

/*
 * cv_broadcast:
 *
 *	Wake all LWPs waiting on a condition variable.  Must be called
 *	with the interlocking mutex held.
 */
void
cv_broadcast(kcondvar_t *cv)
{

	KASSERT(cv_is_valid(cv));

	if (__predict_false(!LIST_EMPTY(CV_SLEEPQ(cv))))
		cv_wakeup_all(cv);
}

/*
 * cv_wakeup_all:
 *
 *	Slow path for cv_broadcast().  Deliberately marked __noinline to
 *	prevent the compiler pulling it in to cv_broadcast(), which adds
 *	extra prologue and epilogue code.
 */
static __noinline void
cv_wakeup_all(kcondvar_t *cv)
{
	sleepq_t *sq;
	kmutex_t *mp;
	lwp_t *l;

	mp = sleepq_hashlock(cv);
	sq = CV_SLEEPQ(cv);
	while ((l = LIST_FIRST(sq)) != NULL) {
		KASSERT(l->l_sleepq == sq);
		KASSERT(l->l_mutex == mp);
		KASSERT(l->l_wchan == cv);
		sleepq_remove(sq, l);
	}
	mutex_spin_exit(mp);
}

/*
 * cv_has_waiters:
 *
 *	For diagnostic assertions: return non-zero if a condition
 *	variable has waiters.
 */
bool
cv_has_waiters(kcondvar_t *cv)
{

	return !LIST_EMPTY(CV_SLEEPQ(cv));
}

/*
 * cv_is_valid:
 *
 *	For diagnostic assertions: return non-zero if a condition
 *	variable appears to be valid.  No locks need be held.
 */
bool
cv_is_valid(kcondvar_t *cv)
{

	return CV_WMESG(cv) != deadcv && CV_WMESG(cv) != NULL;
}
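/*
 * Illustrative note, following from cv_wakeup_one() above: because it
 * keeps dequeueing interruptible waiters until it has also woken one
 * non-interruptible waiter, a single cv_signal() may wake more than
 * one LWP.  For example, if an LWP in cv_wait_sig() sorts ahead of an
 * LWP in cv_wait() on the sleep queue, one cv_signal() wakes both.
 * Callers must therefore re-check their predicate on wakeup rather
 * than assume exclusive delivery.
 */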