/*	$NetBSD: kern_condvar.c,v 1.55 2023/07/17 12:54:29 riastradh Exp $	*/

/*-
 * Copyright (c) 2006, 2007, 2008, 2019, 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Kernel condition variable implementation.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_condvar.c,v 1.55 2023/07/17 12:54:29 riastradh Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lwp.h>
#include <sys/condvar.h>
#include <sys/sleepq.h>
#include <sys/lockdebug.h>
#include <sys/cpu.h>
#include <sys/kernel.h>

/*
 * Accessors for the private contents of the kcondvar_t data type.
 *
 *	cv_opaque[0]	sleepq_t
 *	cv_opaque[1]	description for ps(1)
 *
 * cv_opaque[0] is protected by the interlock passed to cv_wait() (enqueue
 * only), and the sleep queue lock acquired with sleepq_hashlock() (enqueue
 * and dequeue).
 *
 * cv_opaque[1] (the wmesg) is static and does not change throughout the life
 * of the CV.
 */
#define	CV_SLEEPQ(cv)		((sleepq_t *)(cv)->cv_opaque)
#define	CV_WMESG(cv)		((const char *)(cv)->cv_opaque[1])
#define	CV_SET_WMESG(cv, v)	(cv)->cv_opaque[1] = __UNCONST(v)

#define	CV_DEBUG_P(cv)	(CV_WMESG(cv) != nodebug)
#define	CV_RA		((uintptr_t)__builtin_return_address(0))

static void		cv_unsleep(lwp_t *, bool);
static __noinline void	cv_wakeup_one(kcondvar_t *);
static __noinline void	cv_wakeup_all(kcondvar_t *);

syncobj_t cv_syncobj = {
	.sobj_name	= "cv",
	.sobj_flag	= SOBJ_SLEEPQ_SORTED,
	.sobj_unsleep	= cv_unsleep,
	.sobj_changepri	= sleepq_changepri,
	.sobj_lendpri	= sleepq_lendpri,
	.sobj_owner	= syncobj_noowner,
};

static const char deadcv[] = "deadcv";

/*
 * cv_init:
 *
 *	Initialize a condition variable for use.
 */
void
cv_init(kcondvar_t *cv, const char *wmesg)
{

	KASSERT(wmesg != NULL);
	CV_SET_WMESG(cv, wmesg);
	sleepq_init(CV_SLEEPQ(cv));
}
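
/*
 * Example (illustrative sketch only, not part of this file): the usual
 * lifecycle pairs cv_init()/cv_destroy() with the mutex that interlocks
 * the condition.  "sc", "sc_lock" and "sc_cv" are hypothetical softc
 * members shown purely for illustration.
 *
 *	cv_init(&sc->sc_cv, "example");
 *	mutex_init(&sc->sc_lock, MUTEX_DEFAULT, IPL_NONE);
 *	...
 *	cv_destroy(&sc->sc_cv);
 *	mutex_destroy(&sc->sc_lock);
 */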

/*
 * cv_destroy:
 *
 *	Tear down a condition variable.
 */
void
cv_destroy(kcondvar_t *cv)
{

	sleepq_destroy(CV_SLEEPQ(cv));
#ifdef DIAGNOSTIC
	KASSERT(cv_is_valid(cv));
	KASSERT(!cv_has_waiters(cv));
	CV_SET_WMESG(cv, deadcv);
#endif
}

/*
 * cv_enter:
 *
 *	Look up and lock the sleep queue corresponding to the given
 *	condition variable, and increment the number of waiters.
 */
static inline void
cv_enter(kcondvar_t *cv, kmutex_t *mtx, lwp_t *l, bool catch_p)
{
	sleepq_t *sq;
	kmutex_t *mp;

	KASSERT(cv_is_valid(cv));
	KASSERT(!cpu_intr_p());
	KASSERT((l->l_pflag & LP_INTR) == 0 || panicstr != NULL);

	l->l_kpriority = true;
	mp = sleepq_hashlock(cv);
	sq = CV_SLEEPQ(cv);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, cv, CV_WMESG(cv), &cv_syncobj, catch_p);
	mutex_exit(mtx);
	KASSERT(cv_has_waiters(cv));
}

/*
 * cv_unsleep:
 *
 *	Remove an LWP from the condition variable and sleep queue.  This
 *	is called when the LWP has not been awoken normally but instead
 *	interrupted: for example, when a signal is received.  Must be
 *	called with the LWP locked.  Will unlock if "unlock" is true.
 */
static void
cv_unsleep(lwp_t *l, bool unlock)
{
	kcondvar_t *cv __diagused;

	cv = (kcondvar_t *)(uintptr_t)l->l_wchan;

	KASSERT(l->l_wchan == (wchan_t)cv);
	KASSERT(l->l_sleepq == CV_SLEEPQ(cv));
	KASSERT(cv_is_valid(cv));
	KASSERT(cv_has_waiters(cv));

	sleepq_unsleep(l, unlock);
}

/*
 * cv_wait:
 *
 *	Wait non-interruptibly on a condition variable until awoken.
 */
void
cv_wait(kcondvar_t *cv, kmutex_t *mtx)
{
	lwp_t *l = curlwp;

	KASSERT(mutex_owned(mtx));

	cv_enter(cv, mtx, l, false);
	(void)sleepq_block(0, false, &cv_syncobj);
	mutex_enter(mtx);
}
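
/*
 * Example (illustrative sketch only): because a wakeup may be spurious
 * or may race with another consumer, callers always re-test the
 * condition in a loop around cv_wait().  "sc", "sc_lock", "sc_cv" and
 * "sc_ready" are hypothetical names used only for illustration.
 *
 *	mutex_enter(&sc->sc_lock);
 *	while (!sc->sc_ready)
 *		cv_wait(&sc->sc_cv, &sc->sc_lock);
 *	sc->sc_ready = false;
 *	mutex_exit(&sc->sc_lock);
 */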

/*
 * cv_wait_sig:
 *
 *	Wait on a condition variable until awoken or a signal is received.
 *	Will also return early if the process is exiting.  Returns zero if
 *	awoken normally, ERESTART if a signal was received and the system
 *	call is restartable, or EINTR otherwise.
 */
int
cv_wait_sig(kcondvar_t *cv, kmutex_t *mtx)
{
	lwp_t *l = curlwp;
	int error;

	KASSERT(mutex_owned(mtx));

	cv_enter(cv, mtx, l, true);
	error = sleepq_block(0, true, &cv_syncobj);
	mutex_enter(mtx);
	return error;
}
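
/*
 * Example (illustrative sketch only): an interruptible wait propagates
 * the error so the system call can be restarted or fail with EINTR.
 * The names are hypothetical, as above.
 *
 *	mutex_enter(&sc->sc_lock);
 *	while (!sc->sc_ready) {
 *		error = cv_wait_sig(&sc->sc_cv, &sc->sc_lock);
 *		if (error) {
 *			mutex_exit(&sc->sc_lock);
 *			return error;
 *		}
 *	}
 *	mutex_exit(&sc->sc_lock);
 */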

/*
 * cv_timedwait:
 *
 *	Wait on a condition variable until awoken or the specified timeout
 *	expires.  Returns zero if awoken normally or EWOULDBLOCK if the
 *	timeout expired.
 *
 *	timo is a timeout in ticks.  timo = 0 specifies an infinite timeout.
 */
int
cv_timedwait(kcondvar_t *cv, kmutex_t *mtx, int timo)
{
	lwp_t *l = curlwp;
	int error;

	KASSERT(mutex_owned(mtx));

	cv_enter(cv, mtx, l, false);
	error = sleepq_block(timo, false, &cv_syncobj);
	mutex_enter(mtx);
	return error;
}
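
/*
 * Example (illustrative sketch only): a bounded wait converts a
 * duration to ticks, e.g. with mstohz(9), and treats EWOULDBLOCK as a
 * timeout.  Since the wakeup may race the timeout, the caller should
 * still re-check the condition after breaking out.  Hypothetical names.
 *
 *	mutex_enter(&sc->sc_lock);
 *	while (!sc->sc_ready) {
 *		error = cv_timedwait(&sc->sc_cv, &sc->sc_lock, mstohz(100));
 *		if (error == EWOULDBLOCK)
 *			break;
 *	}
 *	done = sc->sc_ready;
 *	mutex_exit(&sc->sc_lock);
 */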

/*
 * cv_timedwait_sig:
 *
 *	Wait on a condition variable until the timeout expires, the LWP
 *	is awoken, or a signal is received.  Will also return early if
 *	the process is exiting.  Returns zero if awoken normally,
 *	EWOULDBLOCK if the timeout expires, ERESTART if a signal was
 *	received and the system call is restartable, or EINTR otherwise.
 *
 *	timo is a timeout in ticks.  timo = 0 specifies an infinite timeout.
 */
int
cv_timedwait_sig(kcondvar_t *cv, kmutex_t *mtx, int timo)
{
	lwp_t *l = curlwp;
	int error;

	KASSERT(mutex_owned(mtx));

	cv_enter(cv, mtx, l, true);
	error = sleepq_block(timo, true, &cv_syncobj);
	mutex_enter(mtx);
	return error;
}
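
/*
 * Example (illustrative sketch only): callers typically distinguish
 * the timeout from signal delivery; hypothetical names again.
 *
 *	while (!sc->sc_ready) {
 *		error = cv_timedwait_sig(&sc->sc_cv, &sc->sc_lock, timo);
 *		if (error == EWOULDBLOCK)
 *			break;
 *		if (error != 0) {
 *			mutex_exit(&sc->sc_lock);
 *			return error;
 *		}
 *	}
 */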

/*
 * Given a number of seconds, sec, and 2^64ths of a second, frac, we
 * want a number of ticks for a timeout:
 *
 *	timo = hz*(sec + frac/2^64)
 *	     = hz*sec + hz*frac/2^64
 *	     = hz*sec + hz*(frachi*2^32 + fraclo)/2^64
 *	     = hz*sec + hz*frachi/2^32 + hz*fraclo/2^64,
 *
 * where frachi is the high 32 bits of frac and fraclo is the
 * low 32 bits.
 *
 * We assume hz < INT_MAX/2 < UINT32_MAX, so
 *
 *	hz*fraclo/2^64 < fraclo*2^32/2^64 <= 1,
 *
 * since fraclo < 2^32.
 *
 * We clamp the result at INT_MAX/2 for a timeout in ticks, since we
 * can't represent timeouts higher than INT_MAX in cv_timedwait, and
 * spurious wakeup is OK.  Moreover, we don't want to wrap around,
 * because we compute end - start in ticks in order to compute the
 * remaining timeout, and that difference cannot wrap around, so we use
 * a timeout less than INT_MAX.  Using INT_MAX/2 provides plenty of
 * margin for paranoia and will exceed most waits in practice by far.
 */
static unsigned
bintime2timo(const struct bintime *bt)
{

	KASSERT(hz < INT_MAX/2);
	CTASSERT(INT_MAX/2 < UINT32_MAX);
	if (bt->sec > ((INT_MAX/2)/hz))
		return INT_MAX/2;
	if ((hz*(bt->frac >> 32) >> 32) > (INT_MAX/2 - hz*bt->sec))
		return INT_MAX/2;

	return hz*bt->sec + (hz*(bt->frac >> 32) >> 32);
}
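
/*
 * Worked example (illustrative, assuming hz = 100): for bt = 1.5s,
 * i.e. sec = 1 and frac = 2^63,
 *
 *	timo = 100*1 + (100*(2^63 >> 32) >> 32)
 *	     = 100 + (100*2^31)/2^32
 *	     = 100 + 50 = 150 ticks,
 *
 * as expected for 1.5 seconds at 100 ticks/sec.
 */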

/*
 * timo is in units of ticks.  We want units of seconds and 2^64ths of
 * a second.  We know hz = 1 sec/tick, and 2^64 = 1 sec/(2^64th of a
 * second), from which we can conclude 2^64 / hz = 1 (2^64th of a
 * second)/tick.  So for the fractional part, we compute
 *
 *	frac = rem * 2^64 / hz
 *	     = ((rem * 2^32) / hz) * 2^32
 *
 * Using truncating integer division instead of real division will
 * leave us with only about 32 bits of precision, which means about
 * 1/4-nanosecond resolution, which is good enough for our purposes.
 */
static struct bintime
timo2bintime(unsigned timo)
{

	return (struct bintime) {
		.sec = timo / hz,
		.frac = (((uint64_t)(timo % hz) << 32)/hz << 32),
	};
}
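
/*
 * Worked example (illustrative, assuming hz = 100): timo = 150 ticks
 * gives sec = 150/100 = 1 and
 *
 *	frac = (((150 % 100) << 32)/100) << 32
 *	     = ((50*2^32)/100) << 32
 *	     = 2^31 * 2^32 = 2^63,
 *
 * i.e. half a second, so the result is 1.5s, inverting the
 * bintime2timo() example above.
 */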

/*
 * cv_timedwaitbt:
 *
 *	Wait on a condition variable until awoken or the specified
 *	timeout expires.  Returns zero if awoken normally or
 *	EWOULDBLOCK if the timeout expires.
 *
 *	On entry, bt is a timeout in bintime.  cv_timedwaitbt subtracts
 *	the time slept, so on exit, bt is the time remaining after
 *	sleeping, clamped at zero once the complete time has elapsed.
 *	No infinite timeout; use cv_wait instead.
 *
 *	epsilon is a requested maximum error in timeout (excluding
 *	spurious wakeups).  Currently not used, will be used in the
 *	future to choose between low- and high-resolution timers.
 *	Actual wakeup time will be somewhere in [t, t + max(e, r) + s)
 *	where r is the finest resolution of clock available and s is
 *	scheduling delays for scheduler overhead and competing threads.
 *	Time is measured by the interrupt source implementing the
 *	timeout, not by another timecounter.
 */
int
cv_timedwaitbt(kcondvar_t *cv, kmutex_t *mtx, struct bintime *bt,
    const struct bintime *epsilon __diagused)
{
	struct bintime slept;
	unsigned start, end;
	int timo;
	int error;

	KASSERTMSG(bt->sec >= 0, "negative timeout");
	KASSERTMSG(epsilon != NULL, "specify maximum requested delay");

	/* If there's nothing left to wait for, time out.  */
	if (bt->sec == 0 && bt->frac == 0)
		return EWOULDBLOCK;

	/* Convert to ticks, but clamp to be >= 1.  */
	timo = bintime2timo(bt);
	KASSERTMSG(timo >= 0, "negative ticks: %d", timo);
	if (timo == 0)
		timo = 1;

	/*
	 * getticks() is technically int, but nothing special
	 * happens on overflow, so we assume two's-complement
	 * wraparound and just treat it as unsigned.
	 */
	start = getticks();
	error = cv_timedwait(cv, mtx, timo);
	end = getticks();

	/*
	 * Set it to the time left, or zero, whichever is larger.  We
	 * do not fail with EWOULDBLOCK here because this may have been
	 * an explicit wakeup, so the caller needs to check before they
	 * give up or else cv_signal would be lost.
	 */
	slept = timo2bintime(end - start);
	if (bintimecmp(bt, &slept, <=)) {
		bt->sec = 0;
		bt->frac = 0;
	} else {
		/* bt := bt - slept */
		bintime_sub(bt, &slept);
	}

	return error;
}
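
/*
 * Example (illustrative sketch only): because bt is decremented in
 * place, the remaining budget carries across spurious wakeups, and
 * cv_timedwaitbt() itself returns EWOULDBLOCK once the budget reaches
 * zero, so the loop terminates on its own.  The names are hypothetical
 * and epsilon is a caller-chosen error bound (here zero, since it is
 * currently unused, but it must not be NULL).
 *
 *	struct bintime bt = { .sec = 5, .frac = 0 };
 *	struct bintime epsilon = { .sec = 0, .frac = 0 };
 *
 *	while (!sc->sc_ready) {
 *		error = cv_timedwaitbt(&sc->sc_cv, &sc->sc_lock, &bt,
 *		    &epsilon);
 *		if (error == EWOULDBLOCK)
 *			break;
 *	}
 */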

/*
 * cv_timedwaitbt_sig:
 *
 *	Wait on a condition variable until awoken, the specified
 *	timeout expires, or interrupted by a signal.  Returns zero if
 *	awoken normally, EWOULDBLOCK if the timeout expires, or
 *	EINTR/ERESTART if interrupted by a signal.
 *
 *	On entry, bt is a timeout in bintime.  cv_timedwaitbt_sig
 *	subtracts the time slept, so on exit, bt is the time remaining
 *	after sleeping, clamped at zero once the complete time has
 *	elapsed.  No infinite timeout; use cv_wait_sig instead.
 *
 *	epsilon is a requested maximum error in timeout (excluding
 *	spurious wakeups).  Currently not used, will be used in the
 *	future to choose between low- and high-resolution timers.
 */
int
cv_timedwaitbt_sig(kcondvar_t *cv, kmutex_t *mtx, struct bintime *bt,
    const struct bintime *epsilon __diagused)
{
	struct bintime slept;
	unsigned start, end;
	int timo;
	int error;

	KASSERTMSG(bt->sec >= 0, "negative timeout");
	KASSERTMSG(epsilon != NULL, "specify maximum requested delay");

	/* If there's nothing left to wait for, time out.  */
	if (bt->sec == 0 && bt->frac == 0)
		return EWOULDBLOCK;

	/* Convert to ticks, but clamp to be >= 1.  */
	timo = bintime2timo(bt);
	KASSERTMSG(timo >= 0, "negative ticks: %d", timo);
	if (timo == 0)
		timo = 1;

	/*
	 * getticks() is technically int, but nothing special
	 * happens on overflow, so we assume two's-complement
	 * wraparound and just treat it as unsigned.
	 */
	start = getticks();
	error = cv_timedwait_sig(cv, mtx, timo);
	end = getticks();

	/*
	 * Set it to the time left, or zero, whichever is larger.  We
	 * do not fail with EWOULDBLOCK here because this may have been
	 * an explicit wakeup, so the caller needs to check before they
	 * give up or else cv_signal would be lost.
	 */
	slept = timo2bintime(end - start);
	if (bintimecmp(bt, &slept, <=)) {
		bt->sec = 0;
		bt->frac = 0;
	} else {
		/* bt := bt - slept */
		bintime_sub(bt, &slept);
	}

	return error;
}

/*
 * cv_signal:
 *
 *	Wake the highest priority LWP waiting on a condition variable.
 *	Must be called with the interlocking mutex held.
 */
void
cv_signal(kcondvar_t *cv)
{

	KASSERT(cv_is_valid(cv));

	if (__predict_false(!LIST_EMPTY(CV_SLEEPQ(cv))))
		cv_wakeup_one(cv);
}
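
/*
 * Example (illustrative sketch only): the producer changes the
 * condition and signals while holding the interlock; hypothetical
 * names.
 *
 *	mutex_enter(&sc->sc_lock);
 *	sc->sc_ready = true;
 *	cv_signal(&sc->sc_cv);
 *	mutex_exit(&sc->sc_lock);
 */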

/*
 * cv_wakeup_one:
 *
 *	Slow path for cv_signal().  Deliberately marked __noinline to
 *	prevent the compiler pulling it in to cv_signal(), which adds
 *	extra prologue and epilogue code.
 */
static __noinline void
cv_wakeup_one(kcondvar_t *cv)
{
	sleepq_t *sq;
	kmutex_t *mp;
	lwp_t *l;

	/*
	 * Keep waking LWPs until a non-interruptible waiter is found.  An
	 * interruptible waiter could fail to do something useful with the
	 * wakeup due to an error return from cv_[timed]wait_sig(), and the
	 * caller of cv_signal() may not expect such a scenario.
	 *
	 * This isn't a problem for non-interruptible waits (untimed and
	 * timed), because if such a waiter is woken here it will not return
	 * an error.
	 */
	mp = sleepq_hashlock(cv);
	sq = CV_SLEEPQ(cv);
	while ((l = LIST_FIRST(sq)) != NULL) {
		KASSERT(l->l_sleepq == sq);
		KASSERT(l->l_mutex == mp);
		KASSERT(l->l_wchan == cv);
		sleepq_remove(sq, l);
		if ((l->l_flag & LW_SINTR) == 0)
			break;
	}
	mutex_spin_exit(mp);
}

/*
 * cv_broadcast:
 *
 *	Wake all LWPs waiting on a condition variable.  Must be called
 *	with the interlocking mutex held.
 */
void
cv_broadcast(kcondvar_t *cv)
{

	KASSERT(cv_is_valid(cv));

	if (__predict_false(!LIST_EMPTY(CV_SLEEPQ(cv))))
		cv_wakeup_all(cv);
}
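
/*
 * Example (illustrative sketch only): use cv_broadcast() when every
 * waiter must re-evaluate the condition, e.g. on teardown;
 * hypothetical names.
 *
 *	mutex_enter(&sc->sc_lock);
 *	sc->sc_dying = true;
 *	cv_broadcast(&sc->sc_cv);
 *	mutex_exit(&sc->sc_lock);
 */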

/*
 * cv_wakeup_all:
 *
 *	Slow path for cv_broadcast().  Deliberately marked __noinline to
 *	prevent the compiler pulling it in to cv_broadcast(), which adds
 *	extra prologue and epilogue code.
 */
static __noinline void
cv_wakeup_all(kcondvar_t *cv)
{
	sleepq_t *sq;
	kmutex_t *mp;
	lwp_t *l;

	mp = sleepq_hashlock(cv);
	sq = CV_SLEEPQ(cv);
	while ((l = LIST_FIRST(sq)) != NULL) {
		KASSERT(l->l_sleepq == sq);
		KASSERT(l->l_mutex == mp);
		KASSERT(l->l_wchan == cv);
		sleepq_remove(sq, l);
	}
	mutex_spin_exit(mp);
}

/*
 * cv_has_waiters:
 *
 *	For diagnostic assertions: return non-zero if a condition
 *	variable has waiters.
 */
bool
cv_has_waiters(kcondvar_t *cv)
{

	return !LIST_EMPTY(CV_SLEEPQ(cv));
}

/*
 * cv_is_valid:
 *
 *	For diagnostic assertions: return non-zero if a condition
 *	variable appears to be valid.  No locks need be held.
 */
bool
cv_is_valid(kcondvar_t *cv)
{

	return CV_WMESG(cv) != deadcv && CV_WMESG(cv) != NULL;
}