/*	$NetBSD: kern_condvar.c,v 1.63 2023/11/02 10:31:55 martin Exp $	*/

/*-
 * Copyright (c) 2006, 2007, 2008, 2019, 2020, 2023
 *     The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Kernel condition variable implementation.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_condvar.c,v 1.63 2023/11/02 10:31:55 martin Exp $");

#include <sys/param.h>

#include <sys/condvar.h>
#include <sys/cpu.h>
#include <sys/kernel.h>
#include <sys/lockdebug.h>
#include <sys/lwp.h>
#include <sys/sleepq.h>
#include <sys/syncobj.h>
#include <sys/systm.h>

/*
 * Accessors for the private contents of the kcondvar_t data type.
 *
 *	cv_opaque[0]	sleepq_t
 *	cv_opaque[1]	description for ps(1)
 *
 * cv_opaque[0] is protected by the interlock passed to cv_wait() (enqueue
 * only), and the sleep queue lock acquired with sleepq_hashlock() (enqueue
 * and dequeue).
 *
 * cv_opaque[1] (the wmesg) is static and does not change throughout the
 * life of the CV.
 */
#define	CV_SLEEPQ(cv)		((sleepq_t *)(cv)->cv_opaque)
#define	CV_WMESG(cv)		((const char *)(cv)->cv_opaque[1])
#define	CV_SET_WMESG(cv, v)	(cv)->cv_opaque[1] = __UNCONST(v)

#define	CV_DEBUG_P(cv)	(CV_WMESG(cv) != nodebug)
#define	CV_RA		((uintptr_t)__builtin_return_address(0))

static void		cv_unsleep(lwp_t *, bool);
static void		cv_wakeup_one(kcondvar_t *);
static void		cv_wakeup_all(kcondvar_t *);

syncobj_t cv_syncobj = {
	.sobj_name	= "cv",
	.sobj_flag	= SOBJ_SLEEPQ_SORTED,
	.sobj_boostpri	= PRI_KERNEL,
	.sobj_unsleep	= cv_unsleep,
	.sobj_changepri	= sleepq_changepri,
	.sobj_lendpri	= sleepq_lendpri,
	.sobj_owner	= syncobj_noowner,
};

static const char deadcv[] = "deadcv";

/*
 * cv_init:
 *
 *	Initialize a condition variable for use.
 */
void
cv_init(kcondvar_t *cv, const char *wmesg)
{

	KASSERT(wmesg != NULL);
	CV_SET_WMESG(cv, wmesg);
	sleepq_init(CV_SLEEPQ(cv));
}
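
/*
 * Usage sketch (illustrative only, not part of this file; the softc
 * structure and member names are hypothetical).  This is the canonical
 * condvar(9) pattern: the condition is always re-tested in a loop under
 * the interlocking mutex, since a wakeup does not guarantee that the
 * condition still holds by the time the waiter runs:
 *
 *	struct softc {
 *		kmutex_t	sc_lock;
 *		kcondvar_t	sc_cv;
 *		bool		sc_ready;
 *	} *sc;
 *
 *	Once, at attach time:
 *		mutex_init(&sc->sc_lock, MUTEX_DEFAULT, IPL_NONE);
 *		cv_init(&sc->sc_cv, "example");
 *
 *	Waiter:
 *		mutex_enter(&sc->sc_lock);
 *		while (!sc->sc_ready)
 *			cv_wait(&sc->sc_cv, &sc->sc_lock);
 *		mutex_exit(&sc->sc_lock);
 *
 *	Waker:
 *		mutex_enter(&sc->sc_lock);
 *		sc->sc_ready = true;
 *		cv_broadcast(&sc->sc_cv);
 *		mutex_exit(&sc->sc_lock);
 */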

/*
 * cv_destroy:
 *
 *	Tear down a condition variable.
 */
void
cv_destroy(kcondvar_t *cv)
{

	sleepq_destroy(CV_SLEEPQ(cv));
#ifdef DIAGNOSTIC
	KASSERT(cv_is_valid(cv));
	KASSERT(!cv_has_waiters(cv));
	CV_SET_WMESG(cv, deadcv);
#endif
}

/*
 * cv_enter:
 *
 *	Look up and lock the sleep queue corresponding to the given
 *	condition variable, and increment the number of waiters.
 */
static inline int
cv_enter(kcondvar_t *cv, kmutex_t *mtx, lwp_t *l, bool catch_p)
{
	sleepq_t *sq;
	kmutex_t *mp;
	int nlocks;

	KASSERT(cv_is_valid(cv));
	KASSERT(!cpu_intr_p());
	KASSERT((l->l_pflag & LP_INTR) == 0 || panicstr != NULL);

	mp = sleepq_hashlock(cv);
	sq = CV_SLEEPQ(cv);
	nlocks = sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, cv, CV_WMESG(cv), &cv_syncobj, catch_p);
	mutex_exit(mtx);
	KASSERT(cv_has_waiters(cv));
	return nlocks;
}

/*
 * cv_unsleep:
 *
 *	Remove an LWP from the condition variable and sleep queue.  This
 *	is called when the LWP has not been awoken normally but instead
 *	interrupted: for example, when a signal is received.  Must be
 *	called with the LWP locked.  Will unlock if "unlock" is true.
 */
static void
cv_unsleep(lwp_t *l, bool unlock)
{
	kcondvar_t *cv __diagused;

	cv = (kcondvar_t *)(uintptr_t)l->l_wchan;

	KASSERT(l->l_wchan == (wchan_t)cv);
	KASSERT(l->l_sleepq == CV_SLEEPQ(cv));
	KASSERT(cv_is_valid(cv));
	KASSERT(cv_has_waiters(cv));

	sleepq_unsleep(l, unlock);
}

/*
 * cv_wait:
 *
 *	Wait non-interruptibly on a condition variable until awoken.
 */
void
cv_wait(kcondvar_t *cv, kmutex_t *mtx)
{
	lwp_t *l = curlwp;
	int nlocks;

	KASSERT(mutex_owned(mtx));

	nlocks = cv_enter(cv, mtx, l, false);
	(void)sleepq_block(0, false, &cv_syncobj, nlocks);
	mutex_enter(mtx);
}

/*
 * cv_wait_sig:
 *
 *	Wait on a condition variable until awoken or a signal is
 *	received.  Will also return early if the process is exiting.
 *	Returns zero if awoken normally, ERESTART if a signal was
 *	received and the system call is restartable, or EINTR otherwise.
 */
int
cv_wait_sig(kcondvar_t *cv, kmutex_t *mtx)
{
	lwp_t *l = curlwp;
	int error, nlocks;

	KASSERT(mutex_owned(mtx));

	nlocks = cv_enter(cv, mtx, l, true);
	error = sleepq_block(0, true, &cv_syncobj, nlocks);
	mutex_enter(mtx);
	return error;
}

/*
 * cv_timedwait:
 *
 *	Wait on a condition variable until awoken or the specified
 *	timeout expires.  Returns zero if awoken normally or
 *	EWOULDBLOCK if the timeout expired.
 *
 *	timo is a timeout in ticks.  timo = 0 specifies an infinite
 *	timeout.
 */
int
cv_timedwait(kcondvar_t *cv, kmutex_t *mtx, int timo)
{
	lwp_t *l = curlwp;
	int error, nlocks;

	KASSERT(mutex_owned(mtx));

	nlocks = cv_enter(cv, mtx, l, false);
	error = sleepq_block(timo, false, &cv_syncobj, nlocks);
	mutex_enter(mtx);
	return error;
}
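
/*
 * Usage sketch for cv_timedwait() (illustrative only; sc and its
 * members are hypothetical).  mstohz(9) converts milliseconds to
 * ticks.  Note that the timeout restarts on every wakeup, so the total
 * wait across spurious wakeups is unbounded; cv_timedwaitbt() below
 * keeps a running budget instead:
 *
 *	bool done;
 *	int error = 0;
 *
 *	mutex_enter(&sc->sc_lock);
 *	while (!sc->sc_done && error != EWOULDBLOCK)
 *		error = cv_timedwait(&sc->sc_cv, &sc->sc_lock,
 *		    mstohz(100));
 *	done = sc->sc_done;
 *	mutex_exit(&sc->sc_lock);
 *	if (!done)
 *		return ETIMEDOUT;
 */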

/*
 * cv_timedwait_sig:
 *
 *	Wait on a condition variable until awoken, the timeout expires,
 *	or a signal is received.  Will also return early if the process
 *	is exiting.  Returns zero if awoken normally, EWOULDBLOCK if the
 *	timeout expires, ERESTART if a signal was received and the
 *	system call is restartable, or EINTR otherwise.
 *
 *	timo is a timeout in ticks.  timo = 0 specifies an infinite
 *	timeout.
 */
int
cv_timedwait_sig(kcondvar_t *cv, kmutex_t *mtx, int timo)
{
	lwp_t *l = curlwp;
	int error, nlocks;

	KASSERT(mutex_owned(mtx));

	nlocks = cv_enter(cv, mtx, l, true);
	error = sleepq_block(timo, true, &cv_syncobj, nlocks);
	mutex_enter(mtx);
	return error;
}

/*
 * Given a number of seconds, sec, and 2^64ths of a second, frac, we
 * want a number of ticks for a timeout:
 *
 *	timo = hz*(sec + frac/2^64)
 *	     = hz*sec + hz*frac/2^64
 *	     = hz*sec + hz*(frachi*2^32 + fraclo)/2^64
 *	     = hz*sec + hz*frachi/2^32 + hz*fraclo/2^64,
 *
 * where frachi is the high 32 bits of frac and fraclo is the low 32
 * bits.
 *
 * We assume hz < INT_MAX/2 < UINT32_MAX, so
 *
 *	hz*fraclo/2^64 < fraclo*2^32/2^64 <= 1,
 *
 * since fraclo < 2^32.
 *
 * We clamp the result at INT_MAX/2 for a timeout in ticks, since we
 * can't represent timeouts higher than INT_MAX in cv_timedwait, and
 * spurious wakeup is OK.  Moreover, we don't want to wrap around,
 * because we compute end - start in ticks in order to compute the
 * remaining timeout, and that difference cannot wrap around, so we
 * use a timeout less than INT_MAX.  Using INT_MAX/2 provides plenty
 * of margin for paranoia and will exceed most waits in practice by
 * far.
 */
static unsigned
bintime2timo(const struct bintime *bt)
{

	KASSERT(hz < INT_MAX/2);
	CTASSERT(INT_MAX/2 < UINT32_MAX);
	if (bt->sec > ((INT_MAX/2)/hz))
		return INT_MAX/2;
	if ((hz*(bt->frac >> 32) >> 32) > (INT_MAX/2 - hz*bt->sec))
		return INT_MAX/2;

	return hz*bt->sec + (hz*(bt->frac >> 32) >> 32);
}

/*
 * timo is in units of ticks.  We want units of seconds and 2^64ths of
 * a second.  We know hz ticks = 1 sec and 2^64 (2^64ths of a second)
 * = 1 sec, from which we can conclude that one tick is 2^64/hz 2^64ths
 * of a second.  So for the fractional part, we compute
 *
 *	frac = rem * 2^64 / hz
 *	     = ((rem * 2^32) / hz) * 2^32
 *
 * Using truncating integer division instead of real division will
 * leave us with only about 32 bits of precision, which means about
 * 1/4-nanosecond resolution, which is good enough for our purposes.
 */
static struct bintime
timo2bintime(unsigned timo)
{

	return (struct bintime) {
		.sec = timo / hz,
		.frac = (((uint64_t)(timo % hz) << 32)/hz << 32),
	};
}
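
/*
 * A worked example of the two conversions above, assuming hz = 100
 * (a 10ms tick), so 150 ticks = 1.5 seconds:
 *
 *	timo2bintime(150) = { .sec  = 150 / 100 = 1,
 *	                      .frac = ((50 << 32) / 100) << 32
 *	                            = 2^63, i.e. 0.5 sec in 2^64ths }
 *
 * and feeding that result back through bintime2timo() recovers the
 * tick count exactly:
 *
 *	hz*sec + (hz*(frac >> 32) >> 32) = 100*1 + ((100 * 2^31) >> 32)
 *	                                 = 100 + 50 = 150.
 */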

/*
 * cv_timedwaitbt:
 *
 *	Wait on a condition variable until awoken or the specified
 *	timeout expires.  Returns zero if awoken normally or
 *	EWOULDBLOCK if the timeout expires.
 *
 *	On entry, bt is a timeout in bintime.  cv_timedwaitbt subtracts
 *	the time slept, so on exit, bt is the time remaining after
 *	sleeping, clamped at zero if the complete time has elapsed.
 *	No infinite timeout; use cv_wait instead.
 *
 *	epsilon is a requested maximum error in timeout (excluding
 *	spurious wakeups).  Currently not used, will be used in the
 *	future to choose between low- and high-resolution timers.
 *	Actual wakeup time will be somewhere in [t, t + max(e, r) + s)
 *	where r is the finest resolution of clock available and s is
 *	scheduling delays for scheduler overhead and competing threads.
 *	Time is measured by the interrupt source implementing the
 *	timeout, not by another timecounter.
 */
int
cv_timedwaitbt(kcondvar_t *cv, kmutex_t *mtx, struct bintime *bt,
    const struct bintime *epsilon __diagused)
{
	struct bintime slept;
	unsigned start, end;
	int timo;
	int error;

	KASSERTMSG(bt->sec >= 0, "negative timeout");
	KASSERTMSG(epsilon != NULL, "specify maximum requested delay");

	/* If there's nothing left to wait for, time out.  */
	if (bt->sec == 0 && bt->frac == 0)
		return EWOULDBLOCK;

	/* Convert to ticks, but clamp to be >= 1.  */
	timo = bintime2timo(bt);
	KASSERTMSG(timo >= 0, "negative ticks: %d", timo);
	if (timo == 0)
		timo = 1;

	/*
	 * getticks() is technically int, but nothing special
	 * happens on overflow, so we assume two's-complement
	 * wraparound and just treat it as unsigned.
	 */
	start = getticks();
	error = cv_timedwait(cv, mtx, timo);
	end = getticks();

	/*
	 * Set it to the time left, or zero, whichever is larger.  We
	 * do not fail with EWOULDBLOCK here because this may have been
	 * an explicit wakeup, so the caller needs to check before they
	 * give up or else cv_signal would be lost.
	 */
	slept = timo2bintime(end - start);
	if (bintimecmp(bt, &slept, <=)) {
		bt->sec = 0;
		bt->frac = 0;
	} else {
		/* bt := bt - slept */
		bintime_sub(bt, &slept);
	}

	return error;
}
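
/*
 * Usage sketch for cv_timedwaitbt() (illustrative only; sc and its
 * members are hypothetical).  Since the time slept is subtracted from
 * *bt on return, passing the same bintime back in on every iteration
 * enforces a total budget (1.5 seconds here) no matter how many
 * spurious wakeups occur.  epsilon must be non-NULL but is currently
 * unused; this one asks for roughly quarter-millisecond accuracy
 * (2^52/2^64 sec):
 *
 *	struct bintime bt = { .sec = 1, .frac = 1ULL << 63 };
 *	struct bintime epsilon = { .sec = 0, .frac = 1ULL << 52 };
 *	int error = 0;
 *
 *	mutex_enter(&sc->sc_lock);
 *	while (!sc->sc_done && error == 0)
 *		error = cv_timedwaitbt(&sc->sc_cv, &sc->sc_lock,
 *		    &bt, &epsilon);
 *	mutex_exit(&sc->sc_lock);
 */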

/*
 * cv_timedwaitbt_sig:
 *
 *	Wait on a condition variable until awoken, the specified
 *	timeout expires, or interrupted by a signal.  Returns zero if
 *	awoken normally, EWOULDBLOCK if the timeout expires, or
 *	EINTR/ERESTART if interrupted by a signal.
 *
 *	On entry, bt is a timeout in bintime.  cv_timedwaitbt_sig
 *	subtracts the time slept, so on exit, bt is the time remaining
 *	after sleeping.  No infinite timeout; use cv_wait_sig instead.
 *
 *	epsilon is a requested maximum error in timeout (excluding
 *	spurious wakeups).  Currently not used, will be used in the
 *	future to choose between low- and high-resolution timers.
 */
int
cv_timedwaitbt_sig(kcondvar_t *cv, kmutex_t *mtx, struct bintime *bt,
    const struct bintime *epsilon __diagused)
{
	struct bintime slept;
	unsigned start, end;
	int timo;
	int error;

	KASSERTMSG(bt->sec >= 0, "negative timeout");
	KASSERTMSG(epsilon != NULL, "specify maximum requested delay");

	/* If there's nothing left to wait for, time out.  */
	if (bt->sec == 0 && bt->frac == 0)
		return EWOULDBLOCK;

	/* Convert to ticks, but clamp to be >= 1.  */
	timo = bintime2timo(bt);
	KASSERTMSG(timo >= 0, "negative ticks: %d", timo);
	if (timo == 0)
		timo = 1;

	/*
	 * getticks() is technically int, but nothing special
	 * happens on overflow, so we assume two's-complement
	 * wraparound and just treat it as unsigned.
	 */
	start = getticks();
	error = cv_timedwait_sig(cv, mtx, timo);
	end = getticks();

	/*
	 * Set it to the time left, or zero, whichever is larger.  We
	 * do not fail with EWOULDBLOCK here because this may have been
	 * an explicit wakeup, so the caller needs to check before they
	 * give up or else cv_signal would be lost.
	 */
	slept = timo2bintime(end - start);
	if (bintimecmp(bt, &slept, <=)) {
		bt->sec = 0;
		bt->frac = 0;
	} else {
		/* bt := bt - slept */
		bintime_sub(bt, &slept);
	}

	return error;
}

/*
 * cv_signal:
 *
 *	Wake the highest priority LWP waiting on a condition variable.
 *	Must be called with the interlocking mutex held or just after it
 *	has been released (so the awoken LWP will see the changed
 *	condition).
 */
void
cv_signal(kcondvar_t *cv)
{

	KASSERT(cv_is_valid(cv));

	if (__predict_false(!LIST_EMPTY(CV_SLEEPQ(cv)))) {
		/*
		 * The compiler usually turns this into a tail call
		 * (i.e. a jmp), because the arguments are the same
		 * and there are no locals.
		 */
		cv_wakeup_one(cv);
	}
}

/*
 * cv_wakeup_one:
 *
 *	Slow path for cv_signal().  Deliberately marked __noinline to
 *	prevent the compiler pulling it into cv_signal(), which would
 *	add extra prologue and epilogue code.
 */
static __noinline void
cv_wakeup_one(kcondvar_t *cv)
{
	sleepq_t *sq;
	kmutex_t *mp;
	lwp_t *l;

	mp = sleepq_hashlock(cv);
	sq = CV_SLEEPQ(cv);
	if (__predict_true((l = LIST_FIRST(sq)) != NULL)) {
		KASSERT(l->l_sleepq == sq);
		KASSERT(l->l_mutex == mp);
		KASSERT(l->l_wchan == cv);
		sleepq_remove(sq, l, true);
	}
	mutex_spin_exit(mp);
}

/*
 * cv_broadcast:
 *
 *	Wake all LWPs waiting on a condition variable.  Must be called
 *	with the interlocking mutex held or just after it has been
 *	released (so the awoken LWP will see the changed condition).
 */
void
cv_broadcast(kcondvar_t *cv)
{

	KASSERT(cv_is_valid(cv));

	if (__predict_false(!LIST_EMPTY(CV_SLEEPQ(cv)))) {
		/*
		 * The compiler usually turns this into a tail call
		 * (i.e. a jmp), because the arguments are the same
		 * and there are no locals.
		 */
		cv_wakeup_all(cv);
	}
}

/*
 * cv_wakeup_all:
 *
 *	Slow path for cv_broadcast().  Deliberately marked __noinline to
 *	prevent the compiler pulling it into cv_broadcast(), which would
 *	add extra prologue and epilogue code.
 */
static __noinline void
cv_wakeup_all(kcondvar_t *cv)
{
	sleepq_t *sq;
	kmutex_t *mp;
	lwp_t *l;

	mp = sleepq_hashlock(cv);
	sq = CV_SLEEPQ(cv);
	while ((l = LIST_FIRST(sq)) != NULL) {
		KASSERT(l->l_sleepq == sq);
		KASSERT(l->l_mutex == mp);
		KASSERT(l->l_wchan == cv);
		sleepq_remove(sq, l, true);
	}
	mutex_spin_exit(mp);
}

/*
 * cv_has_waiters:
 *
 *	For diagnostic assertions: return non-zero if a condition
 *	variable has waiters.
 */
bool
cv_has_waiters(kcondvar_t *cv)
{

	return !LIST_EMPTY(CV_SLEEPQ(cv));
}

/*
 * cv_is_valid:
 *
 *	For diagnostic assertions: return non-zero if a condition
 *	variable appears to be valid.  No locks need be held.
 */
bool
cv_is_valid(kcondvar_t *cv)
{

	return CV_WMESG(cv) != deadcv && CV_WMESG(cv) != NULL;
}
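
/*
 * Teardown sketch using the diagnostic helpers above (illustrative
 * only; sc and its members are hypothetical).  Waiters are assumed to
 * drop sc_refcnt and broadcast as they notice sc_dying; once they have
 * drained, the sleep queue must be empty and the CV can be destroyed:
 *
 *	mutex_enter(&sc->sc_lock);
 *	sc->sc_dying = true;
 *	cv_broadcast(&sc->sc_cv);
 *	while (sc->sc_refcnt > 0)
 *		cv_wait(&sc->sc_cv, &sc->sc_lock);
 *	mutex_exit(&sc->sc_lock);
 *
 *	KASSERT(!cv_has_waiters(&sc->sc_cv));
 *	cv_destroy(&sc->sc_cv);
 *	mutex_destroy(&sc->sc_lock);
 */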