/*	$NetBSD: kern_condvar.c,v 1.41 2018/01/30 07:52:22 ozaki-r Exp $	*/

/*-
 * Copyright (c) 2006, 2007, 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Kernel condition variable implementation.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_condvar.c,v 1.41 2018/01/30 07:52:22 ozaki-r Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lwp.h>
#include <sys/condvar.h>
#include <sys/sleepq.h>
#include <sys/lockdebug.h>
#include <sys/cpu.h>
#include <sys/kernel.h>

/*
 * Accessors for the private contents of the kcondvar_t data type.
 *
 *	cv_opaque[0]	sleepq...
 *	cv_opaque[1]	...pointers
 *	cv_opaque[2]	description for ps(1)
 *
 * cv_opaque[0..1] is protected by the interlock passed to cv_wait() (enqueue
 * only), and the sleep queue lock acquired with sleepq_hashlock() (enqueue
 * and dequeue).
 *
 * cv_opaque[2] (the wmesg) is static and does not change throughout the life
 * of the CV.
 */
#define	CV_SLEEPQ(cv)		((sleepq_t *)(cv)->cv_opaque)
#define	CV_WMESG(cv)		((const char *)(cv)->cv_opaque[2])
#define	CV_SET_WMESG(cv, v) 	(cv)->cv_opaque[2] = __UNCONST(v)

#define	CV_DEBUG_P(cv)	(CV_WMESG(cv) != nodebug)
#define	CV_RA		((uintptr_t)__builtin_return_address(0))

static void		cv_unsleep(lwp_t *, bool);
static inline void	cv_wakeup_one(kcondvar_t *);
static inline void	cv_wakeup_all(kcondvar_t *);

static syncobj_t cv_syncobj = {
	.sobj_flag	= SOBJ_SLEEPQ_SORTED,
	.sobj_unsleep	= cv_unsleep,
	.sobj_changepri	= sleepq_changepri,
	.sobj_lendpri	= sleepq_lendpri,
	.sobj_owner	= syncobj_noowner,
};

lockops_t cv_lockops = {
	.lo_name = "Condition variable",
	.lo_type = LOCKOPS_CV,
	.lo_dump = NULL,
};

static const char deadcv[] = "deadcv";
#ifdef LOCKDEBUG
static const char nodebug[] = "nodebug";

#define CV_LOCKDEBUG_HANDOFF(l, cv) cv_lockdebug_handoff(l, cv)
#define CV_LOCKDEBUG_PROCESS(l, cv) cv_lockdebug_process(l, cv)

static inline void
cv_lockdebug_handoff(lwp_t *l, kcondvar_t *cv)
{

	if (CV_DEBUG_P(cv))
		l->l_flag |= LW_CVLOCKDEBUG;
}

static inline void
cv_lockdebug_process(lwp_t *l, kcondvar_t *cv)
{

	if ((l->l_flag & LW_CVLOCKDEBUG) == 0)
		return;

	l->l_flag &= ~LW_CVLOCKDEBUG;
	LOCKDEBUG_UNLOCKED(true, cv, CV_RA, 0);
}
#else
#define CV_LOCKDEBUG_HANDOFF(l, cv) __nothing
#define CV_LOCKDEBUG_PROCESS(l, cv) __nothing
#endif

/*
 * cv_init:
 *
 *	Initialize a condition variable for use.
 */
void
cv_init(kcondvar_t *cv, const char *wmesg)
{
#ifdef LOCKDEBUG
	bool dodebug;

	dodebug = LOCKDEBUG_ALLOC(cv, &cv_lockops,
	    (uintptr_t)__builtin_return_address(0));
	if (!dodebug) {
		/* XXX This will break vfs_lockf. */
		wmesg = nodebug;
	}
#endif
	KASSERT(wmesg != NULL);
	CV_SET_WMESG(cv, wmesg);
	sleepq_init(CV_SLEEPQ(cv));
}
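
/*
 * Example (illustrative sketch only, not part of this file): the usual
 * lifecycle of a condition variable paired with its interlock.  The
 * example_softc type and its fields are hypothetical.
 */
#if 0
struct example_softc {
	kmutex_t	sc_lock;	/* interlock passed to cv_wait() */
	kcondvar_t	sc_cv;		/* signalled when sc_ready changes */
	bool		sc_ready;
};

static void
example_attach(struct example_softc *sc)
{

	mutex_init(&sc->sc_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&sc->sc_cv, "example");	/* wmesg shown by ps(1) */
	sc->sc_ready = false;
}

static void
example_detach(struct example_softc *sc)
{

	/* No LWP may still be waiting when the CV is destroyed. */
	cv_destroy(&sc->sc_cv);
	mutex_destroy(&sc->sc_lock);
}
#endif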

/*
 * cv_destroy:
 *
 *	Tear down a condition variable.
 */
void
cv_destroy(kcondvar_t *cv)
{

	LOCKDEBUG_FREE(CV_DEBUG_P(cv), cv);
#ifdef DIAGNOSTIC
	KASSERT(cv_is_valid(cv));
	CV_SET_WMESG(cv, deadcv);
#endif
}

/*
 * cv_enter:
 *
 *	Look up and lock the sleep queue corresponding to the given
 *	condition variable, and increment the number of waiters.
 */
static inline void
cv_enter(kcondvar_t *cv, kmutex_t *mtx, lwp_t *l)
{
	sleepq_t *sq;
	kmutex_t *mp;

	KASSERT(cv_is_valid(cv));
	KASSERT(!cpu_intr_p());
	KASSERT((l->l_pflag & LP_INTR) == 0 || panicstr != NULL);

	LOCKDEBUG_LOCKED(CV_DEBUG_P(cv), cv, mtx, CV_RA, 0);

	l->l_kpriority = true;
	mp = sleepq_hashlock(cv);
	sq = CV_SLEEPQ(cv);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, cv, CV_WMESG(cv), &cv_syncobj);
	mutex_exit(mtx);
	KASSERT(cv_has_waiters(cv));
}

/*
 * cv_exit:
 *
 *	After resuming execution, check to see if we have been restarted
 *	as a result of cv_signal().  If we have, but cannot take the
 *	wakeup (because of, e.g., a pending Unix signal or timeout) then
 *	try to ensure that another LWP sees it.  This is necessary
 *	because there may be multiple waiters, and at least one should
 *	take the wakeup if possible.
 */
static inline int
cv_exit(kcondvar_t *cv, kmutex_t *mtx, lwp_t *l, const int error)
{

	mutex_enter(mtx);
	if (__predict_false(error != 0))
		cv_signal(cv);

	LOCKDEBUG_UNLOCKED(CV_DEBUG_P(cv), cv, CV_RA, 0);
	KASSERT(cv_is_valid(cv));

	return error;
}

/*
 * cv_unsleep:
 *
 *	Remove an LWP from the condition variable and sleep queue.  This
 *	is called when the LWP has not been awoken normally but instead
 *	interrupted: for example, when a signal is received.  Must be
 *	called with the LWP locked, and must return it unlocked.
 */
static void
cv_unsleep(lwp_t *l, bool cleanup)
{
	kcondvar_t *cv __diagused;

	cv = (kcondvar_t *)(uintptr_t)l->l_wchan;

	KASSERT(l->l_wchan == (wchan_t)cv);
	KASSERT(l->l_sleepq == CV_SLEEPQ(cv));
	KASSERT(cv_is_valid(cv));
	KASSERT(cv_has_waiters(cv));

	sleepq_unsleep(l, cleanup);
}

/*
 * cv_wait:
 *
 *	Wait non-interruptibly on a condition variable until awoken.
 */
void
cv_wait(kcondvar_t *cv, kmutex_t *mtx)
{
	lwp_t *l = curlwp;

	KASSERT(mutex_owned(mtx));

	cv_enter(cv, mtx, l);

	/*
	 * We can't use cv_exit() here since the cv might be destroyed before
	 * this thread gets a chance to run.  Instead, hand off the lockdebug
	 * responsibility to the thread that wakes us up.
	 */

	CV_LOCKDEBUG_HANDOFF(l, cv);
	(void)sleepq_block(0, false);
	mutex_enter(mtx);
}
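
/*
 * Example (illustrative sketch only): the canonical wait pattern.  The
 * predicate is re-tested in a loop after every wakeup, because another
 * LWP may have consumed the event first and cv_broadcast() can wake
 * more waiters than there are events.  example_softc is the
 * hypothetical structure sketched after cv_init() above.
 */
#if 0
static void
example_consume(struct example_softc *sc)
{

	mutex_enter(&sc->sc_lock);
	while (!sc->sc_ready)
		cv_wait(&sc->sc_cv, &sc->sc_lock);
	/* Here sc_ready is true and sc_lock is held again. */
	sc->sc_ready = false;
	mutex_exit(&sc->sc_lock);
}
#endif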

/*
 * cv_wait_sig:
 *
 *	Wait on a condition variable until awoken or a signal is received.
 *	Will also return early if the process is exiting.  Returns zero if
 *	awoken normally, ERESTART if a signal was received and the system
 *	call is restartable, or EINTR otherwise.
 */
int
cv_wait_sig(kcondvar_t *cv, kmutex_t *mtx)
{
	lwp_t *l = curlwp;
	int error;

	KASSERT(mutex_owned(mtx));

	cv_enter(cv, mtx, l);
	error = sleepq_block(0, true);
	return cv_exit(cv, mtx, l, error);
}

/*
 * cv_timedwait:
 *
 *	Wait on a condition variable until awoken or the specified timeout
 *	expires.  Returns zero if awoken normally or EWOULDBLOCK if the
 *	timeout expired.
 *
 *	timo is a timeout in ticks.  timo = 0 specifies an infinite timeout.
 */
int
cv_timedwait(kcondvar_t *cv, kmutex_t *mtx, int timo)
{
	lwp_t *l = curlwp;
	int error;

	KASSERT(mutex_owned(mtx));

	cv_enter(cv, mtx, l);
	error = sleepq_block(timo, false);
	return cv_exit(cv, mtx, l, error);
}
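
/*
 * Example (illustrative sketch only): bounding the wait with a tick
 * timeout.  mstohz() converts milliseconds to ticks; note that passing
 * timo = 0 would wait forever.  Even after EWOULDBLOCK the predicate is
 * checked once more, since the condition may have become true just as
 * the timeout fired.
 */
#if 0
static int
example_wait_ready(struct example_softc *sc)
{
	int error = 0;

	mutex_enter(&sc->sc_lock);
	while (!sc->sc_ready) {
		/* Give up after roughly one second. */
		error = cv_timedwait(&sc->sc_cv, &sc->sc_lock, mstohz(1000));
		if (error != 0)
			break;
	}
	if (sc->sc_ready)
		error = 0;
	mutex_exit(&sc->sc_lock);
	return error;
}
#endif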

/*
 * cv_timedwait_sig:
 *
 *	Wait on a condition variable until awoken, the specified timeout
 *	expires, or a signal is received.  Will also return early if the
 *	process is exiting.  Returns zero if awoken normally, EWOULDBLOCK
 *	if the timeout expires, ERESTART if a signal was received and the
 *	system call is restartable, or EINTR otherwise.
 *
 *	timo is a timeout in ticks.  timo = 0 specifies an infinite timeout.
 */
int
cv_timedwait_sig(kcondvar_t *cv, kmutex_t *mtx, int timo)
{
	lwp_t *l = curlwp;
	int error;

	KASSERT(mutex_owned(mtx));

	cv_enter(cv, mtx, l);
	error = sleepq_block(timo, true);
	return cv_exit(cv, mtx, l, error);
}

/*
 * Given a number of seconds, sec, and 2^64ths of a second, frac, we
 * want a number of ticks for a timeout:
 *
 *	timo = hz*(sec + frac/2^64)
 *	     = hz*sec + hz*frac/2^64
 *	     = hz*sec + hz*(frachi*2^32 + fraclo)/2^64
 *	     = hz*sec + hz*frachi/2^32 + hz*fraclo/2^64,
 *
 * where frachi is the high 32 bits of frac and fraclo is the
 * low 32 bits.
 *
 * We assume hz < INT_MAX/2 < UINT32_MAX, so
 *
 *	hz*fraclo/2^64 < fraclo*2^32/2^64 <= 1,
 *
 * since fraclo < 2^32.
 *
 * We clamp the result at INT_MAX/2 for a timeout in ticks, since we
 * can't represent timeouts higher than INT_MAX in cv_timedwait, and
 * spurious wakeup is OK.  Moreover, we don't want to wrap around,
 * because we compute end - start in ticks in order to compute the
 * remaining timeout, and that difference cannot wrap around, so we use
 * a timeout less than INT_MAX.  Using INT_MAX/2 provides plenty of
 * margin for paranoia and will exceed most waits in practice by far.
 */
static unsigned
bintime2timo(const struct bintime *bt)
{

	KASSERT(hz < INT_MAX/2);
	CTASSERT(INT_MAX/2 < UINT32_MAX);
	if (bt->sec > ((INT_MAX/2)/hz))
		return INT_MAX/2;
	if ((hz*(bt->frac >> 32) >> 32) > (INT_MAX/2 - hz*bt->sec))
		return INT_MAX/2;

	return hz*bt->sec + (hz*(bt->frac >> 32) >> 32);
}
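
/*
 * Worked example (added for illustration): with hz = 100 and a 1.5 s
 * timeout, bt->sec = 1 and bt->frac = 2^63, so frachi = 2^31 and
 *
 *	timo = 100*1 + (100*2^31 >> 32) = 100 + 50 = 150 ticks.
 */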

/*
 * timo is in units of ticks.  We want units of seconds and 2^64ths of
 * a second.  We know hz = 1 sec/tick, and 2^64 = 1 sec/(2^64th of a
 * second), from which we can conclude 2^64 / hz = 1 (2^64th of a
 * second)/tick.  So for the fractional part, we compute
 *
 *	frac = rem * 2^64 / hz
 *	     = ((rem * 2^32) / hz) * 2^32
 *
 * Using truncating integer division instead of real division will
 * leave us with only about 32 bits of precision, which means about
 * 1/4-nanosecond resolution, which is good enough for our purposes.
 */
static struct bintime
timo2bintime(unsigned timo)
{

	return (struct bintime) {
		.sec = timo / hz,
		.frac = (((uint64_t)(timo % hz) << 32)/hz << 32),
	};
}
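
/*
 * Worked example (added for illustration): with hz = 100, timo = 150
 * ticks gives sec = 1 and frac = ((50 << 32)/100) << 32 = 2^63, i.e.
 * exactly 1.5 s, inverting the bintime2timo example above.
 */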

/*
 * cv_timedwaitbt:
 *
 *	Wait on a condition variable until awoken or the specified
 *	timeout expires.  Returns zero if awoken normally or
 *	EWOULDBLOCK if the timeout expires.
 *
 *	On entry, bt is a timeout in bintime.  cv_timedwaitbt subtracts
 *	the time slept, so on exit, bt is the time remaining after
 *	sleeping, possibly negative if the complete time has elapsed.
 *	No infinite timeout; use cv_wait instead.
 *
 *	epsilon is a requested maximum error in timeout (excluding
 *	spurious wakeups).  Currently not used, will be used in the
 *	future to choose between low- and high-resolution timers.
 *	Actual wakeup time will be somewhere in [t, t + max(e, r) + s)
 *	where r is the finest resolution of clock available and s is
 *	scheduling delays for scheduler overhead and competing threads.
 *	Time is measured by the interrupt source implementing the
 *	timeout, not by another timecounter.
 */
int
cv_timedwaitbt(kcondvar_t *cv, kmutex_t *mtx, struct bintime *bt,
    const struct bintime *epsilon __diagused)
{
	struct bintime slept;
	unsigned start, end;
	int error;

	KASSERTMSG(bt->sec >= 0, "negative timeout");
	KASSERTMSG(epsilon != NULL, "specify maximum requested delay");

	/*
	 * hardclock_ticks is technically int, but nothing special
	 * happens on overflow, so we assume two's-complement
	 * wraparound and just treat it as unsigned.
	 */
	start = hardclock_ticks;
	error = cv_timedwait(cv, mtx, bintime2timo(bt));
	end = hardclock_ticks;

	slept = timo2bintime(end - start);
	/* bt := bt - slept */
	bintime_sub(bt, &slept);

	return error;
}
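
/*
 * Example (illustrative sketch only): because cv_timedwaitbt() deducts
 * the time already slept from *bt, the remaining budget carries over
 * across iterations of the usual predicate loop.  The zero epsilon here
 * is an arbitrary choice (the argument is currently unused but must be
 * non-NULL); example_softc is hypothetical as above.
 */
#if 0
static int
example_wait_ready_bt(struct example_softc *sc)
{
	/* Total budget 2.5 s: frac is in 2^64ths, so 2^63 is half a second. */
	struct bintime bt = { .sec = 2, .frac = 1ULL << 63 };
	const struct bintime epsilon = { .sec = 0, .frac = 0 };
	int error = 0;

	mutex_enter(&sc->sc_lock);
	while (!sc->sc_ready) {
		error = cv_timedwaitbt(&sc->sc_cv, &sc->sc_lock, &bt,
		    &epsilon);
		if (error != 0)	/* EWOULDBLOCK once the budget is spent */
			break;
	}
	mutex_exit(&sc->sc_lock);
	return error;
}
#endif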

/*
 * cv_timedwaitbt_sig:
 *
 *	Wait on a condition variable until awoken, the specified
 *	timeout expires, or interrupted by a signal.  Returns zero if
 *	awoken normally, EWOULDBLOCK if the timeout expires, or
 *	EINTR/ERESTART if interrupted by a signal.
 *
 *	On entry, bt is a timeout in bintime.  cv_timedwaitbt_sig
 *	subtracts the time slept, so on exit, bt is the time remaining
 *	after sleeping.  No infinite timeout; use cv_wait_sig instead.
 *
 *	epsilon is a requested maximum error in timeout (excluding
 *	spurious wakeups).  Currently not used, will be used in the
 *	future to choose between low- and high-resolution timers.
 */
int
cv_timedwaitbt_sig(kcondvar_t *cv, kmutex_t *mtx, struct bintime *bt,
    const struct bintime *epsilon __diagused)
{
	struct bintime slept;
	unsigned start, end;
	int error;

	KASSERTMSG(bt->sec >= 0, "negative timeout");
	KASSERTMSG(epsilon != NULL, "specify maximum requested delay");

	/*
	 * hardclock_ticks is technically int, but nothing special
	 * happens on overflow, so we assume two's-complement
	 * wraparound and just treat it as unsigned.
	 */
	start = hardclock_ticks;
	error = cv_timedwait_sig(cv, mtx, bintime2timo(bt));
	end = hardclock_ticks;

	slept = timo2bintime(end - start);
	/* bt := bt - slept */
	bintime_sub(bt, &slept);

	return error;
}

/*
 * cv_signal:
 *
 *	Wake the highest priority LWP waiting on a condition variable.
 *	Must be called with the interlocking mutex held.
 */
void
cv_signal(kcondvar_t *cv)
{

	/* LOCKDEBUG_WAKEUP(CV_DEBUG_P(cv), cv, CV_RA); */
	KASSERT(cv_is_valid(cv));

	if (__predict_false(!TAILQ_EMPTY(CV_SLEEPQ(cv))))
		cv_wakeup_one(cv);
}
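
/*
 * Example (illustrative sketch only): the producer side.  The state
 * change and the wakeup happen under the same interlock, so a waiter
 * cannot test the predicate and go to sleep in between.
 */
#if 0
static void
example_produce(struct example_softc *sc)
{

	mutex_enter(&sc->sc_lock);
	sc->sc_ready = true;
	cv_signal(&sc->sc_cv);	/* cv_broadcast() would wake all waiters */
	mutex_exit(&sc->sc_lock);
}
#endif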

static inline void
cv_wakeup_one(kcondvar_t *cv)
{
	sleepq_t *sq;
	kmutex_t *mp;
	lwp_t *l;

	KASSERT(cv_is_valid(cv));

	mp = sleepq_hashlock(cv);
	sq = CV_SLEEPQ(cv);
	l = TAILQ_FIRST(sq);
	if (l == NULL) {
		mutex_spin_exit(mp);
		return;
	}
	KASSERT(l->l_sleepq == sq);
	KASSERT(l->l_mutex == mp);
	KASSERT(l->l_wchan == cv);
	CV_LOCKDEBUG_PROCESS(l, cv);
	sleepq_remove(sq, l);
	mutex_spin_exit(mp);

	KASSERT(cv_is_valid(cv));
}

/*
 * cv_broadcast:
 *
 *	Wake all LWPs waiting on a condition variable.  Must be called
 *	with the interlocking mutex held.
 */
void
cv_broadcast(kcondvar_t *cv)
{

	/* LOCKDEBUG_WAKEUP(CV_DEBUG_P(cv), cv, CV_RA); */
	KASSERT(cv_is_valid(cv));

	if (__predict_false(!TAILQ_EMPTY(CV_SLEEPQ(cv))))
		cv_wakeup_all(cv);
}

static inline void
cv_wakeup_all(kcondvar_t *cv)
{
	sleepq_t *sq;
	kmutex_t *mp;
	lwp_t *l, *next;

	KASSERT(cv_is_valid(cv));

	mp = sleepq_hashlock(cv);
	sq = CV_SLEEPQ(cv);
	for (l = TAILQ_FIRST(sq); l != NULL; l = next) {
		KASSERT(l->l_sleepq == sq);
		KASSERT(l->l_mutex == mp);
		KASSERT(l->l_wchan == cv);
		next = TAILQ_NEXT(l, l_sleepchain);
		CV_LOCKDEBUG_PROCESS(l, cv);
		sleepq_remove(sq, l);
	}
	mutex_spin_exit(mp);

	KASSERT(cv_is_valid(cv));
}

/*
 * cv_has_waiters:
 *
 *	For diagnostic assertions: return non-zero if a condition
 *	variable has waiters.
 */
bool
cv_has_waiters(kcondvar_t *cv)
{

	return !TAILQ_EMPTY(CV_SLEEPQ(cv));
}

/*
 * cv_is_valid:
 *
 *	For diagnostic assertions: return non-zero if a condition
 *	variable appears to be valid.  No locks need be held.
 */
bool
cv_is_valid(kcondvar_t *cv)
{

	return CV_WMESG(cv) != deadcv && CV_WMESG(cv) != NULL;
}