1 /*	$NetBSD: kern_rwlock.c,v 1.76 2023/10/15 10:28:48 riastradh Exp $	*/
2 
3 /*-
4  * Copyright (c) 2002, 2006, 2007, 2008, 2009, 2019, 2020, 2023
5  *     The NetBSD Foundation, Inc.
6  * All rights reserved.
7  *
8  * This code is derived from software contributed to The NetBSD Foundation
9  * by Jason R. Thorpe and Andrew Doran.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  * POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 /*
34  * Kernel reader/writer lock implementation, modeled after those
35  * found in Solaris, a description of which can be found in:
36  *
37  *	Solaris Internals: Core Kernel Architecture, Jim Mauro and
38  *	    Richard McDougall.
39  *
40  * The NetBSD implementation differs from that described in the book, in
41  * that the locks are partially adaptive.  Lock waiters spin-wait while a
42  * lock is write-held and the holder is still running on a CPU.  The method
43  * of choosing which threads to awaken when a lock is released also differs,
44  * mainly to take account of the partially adaptive behaviour.
45  */
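/*
 * Illustrative sketch (not part of this file): typical use of the API
 * implemented below.  The names "example_lock" and "example_count" are
 * hypothetical.
 *
 *	static krwlock_t example_lock;
 *	static int example_count;
 *
 *	rw_init(&example_lock);
 *
 *	Readers may hold the lock concurrently:
 *
 *		rw_enter(&example_lock, RW_READER);
 *		v = example_count;
 *		rw_exit(&example_lock);
 *
 *	A writer excludes both readers and other writers:
 *
 *		rw_enter(&example_lock, RW_WRITER);
 *		example_count++;
 *		rw_exit(&example_lock);
 */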
46 
47 #include <sys/cdefs.h>
48 __KERNEL_RCSID(0, "$NetBSD: kern_rwlock.c,v 1.76 2023/10/15 10:28:48 riastradh Exp $");
49 
50 #include "opt_lockdebug.h"
51 
52 #define	__RWLOCK_PRIVATE
53 
54 #include <sys/param.h>
55 
56 #include <sys/atomic.h>
57 #include <sys/cpu.h>
58 #include <sys/lock.h>
59 #include <sys/lockdebug.h>
60 #include <sys/proc.h>
61 #include <sys/pserialize.h>
62 #include <sys/rwlock.h>
63 #include <sys/sched.h>
64 #include <sys/sleepq.h>
65 #include <sys/syncobj.h>
66 #include <sys/systm.h>
67 
68 #include <dev/lockstat.h>
69 
70 #include <machine/rwlock.h>
71 
72 /*
73  * LOCKDEBUG
74  */
75 
76 #define	RW_DEBUG_P(rw)		(((rw)->rw_owner & RW_NODEBUG) == 0)
77 
78 #define	RW_WANTLOCK(rw, op) \
79     LOCKDEBUG_WANTLOCK(RW_DEBUG_P(rw), (rw), \
80         (uintptr_t)__builtin_return_address(0), op == RW_READER);
81 #define	RW_LOCKED(rw, op) \
82     LOCKDEBUG_LOCKED(RW_DEBUG_P(rw), (rw), NULL, \
83         (uintptr_t)__builtin_return_address(0), op == RW_READER);
84 #define	RW_UNLOCKED(rw, op) \
85     LOCKDEBUG_UNLOCKED(RW_DEBUG_P(rw), (rw), \
86         (uintptr_t)__builtin_return_address(0), op == RW_READER);
87 
88 /*
89  * DIAGNOSTIC
90  */
91 
92 #if defined(DIAGNOSTIC)
93 #define	RW_ASSERT(rw, cond) \
94 do { \
95 	if (__predict_false(!(cond))) \
96 		rw_abort(__func__, __LINE__, rw, "assertion failed: " #cond);\
97 } while (/* CONSTCOND */ 0)
98 #else
99 #define	RW_ASSERT(rw, cond)	/* nothing */
100 #endif	/* DIAGNOSTIC */
101 
102 /*
103  * For platforms that do not provide stubs, or for the LOCKDEBUG case.
104  */
105 #ifdef LOCKDEBUG
106 #undef	__HAVE_RW_STUBS
107 #endif
108 
109 #ifndef __HAVE_RW_STUBS
110 __strong_alias(rw_enter,rw_vector_enter);
111 __strong_alias(rw_exit,rw_vector_exit);
112 __strong_alias(rw_tryenter,rw_vector_tryenter);
113 #endif
114 
115 static void	rw_abort(const char *, size_t, krwlock_t *, const char *);
116 static void	rw_dump(const volatile void *, lockop_printer_t);
117 static lwp_t	*rw_owner(wchan_t);
118 
119 lockops_t rwlock_lockops = {
120 	.lo_name = "Reader / writer lock",
121 	.lo_type = LOCKOPS_SLEEP,
122 	.lo_dump = rw_dump,
123 };
124 
125 /*
126  * Give rwlock holders an extra-high priority boost when blocking, due to
127  * direct handoff.  XXX To be revisited.
128  */
129 syncobj_t rw_syncobj = {
130 	.sobj_name	= "rwlock",
131 	.sobj_flag	= SOBJ_SLEEPQ_SORTED,
132 	.sobj_boostpri  = PRI_KTHREAD,
133 	.sobj_unsleep	= turnstile_unsleep,
134 	.sobj_changepri	= turnstile_changepri,
135 	.sobj_lendpri	= sleepq_lendpri,
136 	.sobj_owner	= rw_owner,
137 };
138 
139 /*
140  * rw_cas:
141  *
142  *	Do an atomic compare-and-swap on the lock word.
143  */
144 static inline uintptr_t
145 rw_cas(krwlock_t *rw, uintptr_t o, uintptr_t n)
146 {
147 
148 	return (uintptr_t)atomic_cas_ptr((volatile void *)&rw->rw_owner,
149 	    (void *)o, (void *)n);
150 }
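/*
 * rw_cas() returns the value that was actually found in the lock word;
 * callers treat equality with the expected value as success, and
 * otherwise retry against the freshly observed value.  A minimal sketch
 * of the idiom used throughout this file (RW_EXAMPLE_BIT is a
 * hypothetical flag):
 *
 *	uintptr_t owner, next;
 *
 *	for (owner = rw->rw_owner;; owner = next) {
 *		next = rw_cas(rw, owner, owner | RW_EXAMPLE_BIT);
 *		if (next == owner)
 *			break;
 *	}
 *
 * On success the loop exits; on failure "next" holds the value seen by
 * the CAS and the next iteration retries against it.
 */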
151 
152 /*
153  * rw_swap:
154  *
155  *	Do an atomic swap of the lock word.  This is used only when it's
156  *	known that the lock word is set up such that it can't be changed
157  *	behind us (assert this), so there's no point considering the result.
158  */
159 static inline void
160 rw_swap(krwlock_t *rw, uintptr_t o, uintptr_t n)
161 {
162 
163 	n = (uintptr_t)atomic_swap_ptr((volatile void *)&rw->rw_owner,
164 	    (void *)n);
165 
166 	RW_ASSERT(rw, n == o);
167 	RW_ASSERT(rw, (o & RW_HAS_WAITERS) != 0);
168 }
169 
170 /*
171  * rw_dump:
172  *
173  *	Dump the contents of a rwlock structure.
174  */
175 static void
176 rw_dump(const volatile void *cookie, lockop_printer_t pr)
177 {
178 	const volatile krwlock_t *rw = cookie;
179 
180 	pr("owner/count  : %#018lx flags    : %#018x\n",
181 	    (long)RW_OWNER(rw), (int)RW_FLAGS(rw));
182 }
183 
184 /*
185  * rw_abort:
186  *
187  *	Dump information about an error and panic the system.  This
188  *	generates a lot of machine code in the DIAGNOSTIC case, so
189  *	we ask the compiler to not inline it.
190  */
191 static void __noinline
192 rw_abort(const char *func, size_t line, krwlock_t *rw, const char *msg)
193 {
194 
195 	if (__predict_false(panicstr != NULL))
196 		return;
197 
198 	LOCKDEBUG_ABORT(func, line, rw, &rwlock_lockops, msg);
199 }
200 
201 /*
202  * rw_init:
203  *
204  *	Initialize a rwlock for use.
205  */
206 void
207 _rw_init(krwlock_t *rw, uintptr_t return_address)
208 {
209 
210 #ifdef LOCKDEBUG
211 	/* XXX only because the assembly stubs can't handle RW_NODEBUG */
212 	if (LOCKDEBUG_ALLOC(rw, &rwlock_lockops, return_address))
213 		rw->rw_owner = 0;
214 	else
215 		rw->rw_owner = RW_NODEBUG;
216 #else
217 	rw->rw_owner = 0;
218 #endif
219 }
220 
221 void
222 rw_init(krwlock_t *rw)
223 {
224 
225 	_rw_init(rw, (uintptr_t)__builtin_return_address(0));
226 }
227 
228 /*
229  * rw_destroy:
230  *
231  *	Tear down a rwlock.
232  */
233 void
234 rw_destroy(krwlock_t *rw)
235 {
236 
237 	RW_ASSERT(rw, (rw->rw_owner & ~RW_NODEBUG) == 0);
238 	LOCKDEBUG_FREE((rw->rw_owner & RW_NODEBUG) == 0, rw);
239 }
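/*
 * Illustrative lifecycle sketch (the softc and field names are
 * hypothetical): initialize a lock once before first use, and destroy
 * it only when no thread can still hold or wait on it.
 *
 *	struct example_softc {
 *		krwlock_t	sc_lock;
 *	};
 *
 *	At attach time:		rw_init(&sc->sc_lock);
 *	At detach time:		rw_destroy(&sc->sc_lock);
 */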
240 
241 /*
242  * rw_oncpu:
243  *
244  *	Return true if an rwlock owner is running on a CPU in the system.
245  *	If the target is waiting on the kernel big lock, then we must
246  *	release it.  This is necessary to avoid deadlock.
247  */
248 static bool
249 rw_oncpu(uintptr_t owner)
250 {
251 #ifdef MULTIPROCESSOR
252 	struct cpu_info *ci;
253 	lwp_t *l;
254 
255 	KASSERT(kpreempt_disabled());
256 
257 	if ((owner & (RW_WRITE_LOCKED|RW_HAS_WAITERS)) != RW_WRITE_LOCKED) {
258 		return false;
259 	}
260 
261 	/*
262 	 * See lwp_dtor() for why dereferencing the LWP pointer is safe.
263 	 * We must have kernel preemption disabled for that.
264 	 */
265 	l = (lwp_t *)(owner & RW_THREAD);
266 	ci = l->l_cpu;
267 
268 	if (ci && ci->ci_curlwp == l) {
269 		/* Target is running; do we need to block? */
270 		return (ci->ci_biglock_wanted != l);
271 	}
272 #endif
273 	/* Not running.  It may be safe to block now. */
274 	return false;
275 }
276 
277 /*
278  * rw_vector_enter:
279  *
280  *	Acquire a rwlock.
281  */
282 void
283 rw_vector_enter(krwlock_t *rw, const krw_t op)
284 {
285 	uintptr_t owner, incr, need_wait, set_wait, curthread, next;
286 	turnstile_t *ts;
287 	int queue;
288 	lwp_t *l;
289 	LOCKSTAT_TIMER(slptime);
290 	LOCKSTAT_TIMER(slpcnt);
291 	LOCKSTAT_TIMER(spintime);
292 	LOCKSTAT_COUNTER(spincnt);
293 	LOCKSTAT_FLAG(lsflag);
294 
295 	l = curlwp;
296 	curthread = (uintptr_t)l;
297 
298 	RW_ASSERT(rw, !cpu_intr_p());
299 	RW_ASSERT(rw, curthread != 0);
300 	RW_WANTLOCK(rw, op);
301 
302 	if (__predict_true(panicstr == NULL)) {
303 		KDASSERT(pserialize_not_in_read_section());
304 		LOCKDEBUG_BARRIER(&kernel_lock, 1);
305 	}
306 
307 	/*
308 	 * We play a slight trick here.  If we're a reader, we want to
309 	 * increment the read count.  If we're a writer, we want to
310 	 * set the owner field and the WRITE_LOCKED bit.
311 	 *
312 	 * In the latter case, we expect those bits to be zero,
313 	 * therefore we can use an add operation to set them, which
314 	 * means a single add operation serves both cases.
315 	 */
316 	if (__predict_true(op == RW_READER)) {
317 		incr = RW_READ_INCR;
318 		set_wait = RW_HAS_WAITERS;
319 		need_wait = RW_WRITE_LOCKED | RW_WRITE_WANTED;
320 		queue = TS_READER_Q;
321 	} else {
322 		RW_ASSERT(rw, op == RW_WRITER);
323 		incr = curthread | RW_WRITE_LOCKED;
324 		set_wait = RW_HAS_WAITERS | RW_WRITE_WANTED;
325 		need_wait = RW_WRITE_LOCKED | RW_THREAD;
326 		queue = TS_WRITER_Q;
327 	}
328 
329 	LOCKSTAT_ENTER(lsflag);
330 
331 	KPREEMPT_DISABLE(curlwp);
332 	for (owner = rw->rw_owner;;) {
333 		/*
334 		 * Read the lock owner field.  If the need-to-wait
335 		 * indicator is clear, then try to acquire the lock.
336 		 */
337 		if ((owner & need_wait) == 0) {
338 			next = rw_cas(rw, owner, (owner + incr) &
339 			    ~RW_WRITE_WANTED);
340 			if (__predict_true(next == owner)) {
341 				/* Got it! */
342 				membar_acquire();
343 				break;
344 			}
345 
346 			/*
347 			 * Didn't get it -- spin around again (we'll
348 			 * probably sleep on the next iteration).
349 			 */
350 			owner = next;
351 			continue;
352 		}
353 		if (__predict_false(RW_OWNER(rw) == curthread)) {
354 			rw_abort(__func__, __LINE__, rw,
355 			    "locking against myself");
356 		}
357 		/*
358 		 * If the lock owner is running on another CPU, and
359 		 * there are no existing waiters, then spin.
360 		 */
361 		if (rw_oncpu(owner)) {
362 			LOCKSTAT_START_TIMER(lsflag, spintime);
363 			u_int count = SPINLOCK_BACKOFF_MIN;
364 			do {
365 				KPREEMPT_ENABLE(curlwp);
366 				SPINLOCK_BACKOFF(count);
367 				KPREEMPT_DISABLE(curlwp);
368 				owner = rw->rw_owner;
369 			} while (rw_oncpu(owner));
370 			LOCKSTAT_STOP_TIMER(lsflag, spintime);
371 			LOCKSTAT_COUNT(spincnt, 1);
372 			if ((owner & need_wait) == 0)
373 				continue;
374 		}
375 
376 		/*
377 		 * Grab the turnstile chain lock.  Once we have that, we
378 		 * can adjust the waiter bits and sleep queue.
379 		 */
380 		ts = turnstile_lookup(rw);
381 
382 		/*
383 		 * Mark the rwlock as having waiters.  If the set fails,
384 		 * then we may not need to sleep and should spin again.
385 		 * Reload rw_owner because turnstile_lookup() may have
386 		 * spun on the turnstile chain lock.
387 		 */
388 		owner = rw->rw_owner;
389 		if ((owner & need_wait) == 0 || rw_oncpu(owner)) {
390 			turnstile_exit(rw);
391 			continue;
392 		}
393 		next = rw_cas(rw, owner, owner | set_wait);
394 		/* XXX membar? */
395 		if (__predict_false(next != owner)) {
396 			turnstile_exit(rw);
397 			owner = next;
398 			continue;
399 		}
400 
401 		LOCKSTAT_START_TIMER(lsflag, slptime);
402 		turnstile_block(ts, queue, rw, &rw_syncobj);
403 		LOCKSTAT_STOP_TIMER(lsflag, slptime);
404 		LOCKSTAT_COUNT(slpcnt, 1);
405 
406 		/*
407 		 * No need for a memory barrier because of context switch.
408 		 * If not handed the lock, then spin again.
409 		 */
410 		if (op == RW_READER || (rw->rw_owner & RW_THREAD) == curthread)
411 			break;
412 
413 		owner = rw->rw_owner;
414 	}
415 	KPREEMPT_ENABLE(curlwp);
416 
417 	LOCKSTAT_EVENT_RA(lsflag, rw, LB_RWLOCK |
418 	    (op == RW_WRITER ? LB_SLEEP1 : LB_SLEEP2), slpcnt, slptime,
419 	    (l->l_rwcallsite != 0 ? l->l_rwcallsite :
420 	      (uintptr_t)__builtin_return_address(0)));
421 	LOCKSTAT_EVENT_RA(lsflag, rw, LB_RWLOCK | LB_SPIN, spincnt, spintime,
422 	    (l->l_rwcallsite != 0 ? l->l_rwcallsite :
423 	      (uintptr_t)__builtin_return_address(0)));
424 	LOCKSTAT_EXIT(lsflag);
425 
426 	RW_ASSERT(rw, (op != RW_READER && RW_OWNER(rw) == curthread) ||
427 	    (op == RW_READER && RW_COUNT(rw) != 0));
428 	RW_LOCKED(rw, op);
429 }
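/*
 * The single add in the loop above works because the lock word encodes
 * either a reader count or the owning LWP, never both.  A sketch of the
 * two acquired states (waiter flag bits may also be set):
 *
 *	held by N readers:	rw_owner == N * RW_READ_INCR
 *	write held by LWP l:	rw_owner == (uintptr_t)l | RW_WRITE_LOCKED
 *
 * Acquisition adds "incr" to an unowned lock word; rw_vector_exit()
 * subtracts the same quantity, so both paths share one arithmetic form.
 */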
430 
431 /*
432  * rw_vector_exit:
433  *
434  *	Release a rwlock.
435  */
436 void
437 rw_vector_exit(krwlock_t *rw)
438 {
439 	uintptr_t curthread, owner, decr, newown, next;
440 	turnstile_t *ts;
441 	int rcnt, wcnt;
442 	lwp_t *l;
443 
444 	l = curlwp;
445 	curthread = (uintptr_t)l;
446 	RW_ASSERT(rw, curthread != 0);
447 
448 	/*
449 	 * Again, we use a trick.  Since we used an add operation to
450 	 * set the required lock bits, we can use a subtract to clear
451 	 * them, which makes the read-release and write-release paths
452 	 * the same.
453 	 */
454 	owner = rw->rw_owner;
455 	if (__predict_false((owner & RW_WRITE_LOCKED) != 0)) {
456 		RW_UNLOCKED(rw, RW_WRITER);
457 		RW_ASSERT(rw, RW_OWNER(rw) == curthread);
458 		decr = curthread | RW_WRITE_LOCKED;
459 	} else {
460 		RW_UNLOCKED(rw, RW_READER);
461 		RW_ASSERT(rw, RW_COUNT(rw) != 0);
462 		decr = RW_READ_INCR;
463 	}
464 
465 	/*
466 	 * Compute what we expect the new value of the lock to be. Only
467 	 * proceed to do direct handoff if there are waiters, and if the
468 	 * lock would become unowned.
469 	 */
470 	membar_release();
471 	for (;;) {
472 		newown = (owner - decr);
473 		if ((newown & (RW_THREAD | RW_HAS_WAITERS)) == RW_HAS_WAITERS)
474 			break;
475 		next = rw_cas(rw, owner, newown);
476 		if (__predict_true(next == owner))
477 			return;
478 		owner = next;
479 	}
480 
481 	/*
482 	 * Grab the turnstile chain lock.  This gets the interlock
483 	 * on the sleep queue.  Once we have that, we can adjust the
484 	 * waiter bits.
485 	 */
486 	ts = turnstile_lookup(rw);
487 	owner = rw->rw_owner;
488 	RW_ASSERT(rw, ts != NULL);
489 	RW_ASSERT(rw, (owner & RW_HAS_WAITERS) != 0);
490 
491 	wcnt = TS_WAITERS(ts, TS_WRITER_Q);
492 	rcnt = TS_WAITERS(ts, TS_READER_Q);
493 
494 	/*
495 	 * Give the lock away.
496 	 *
497 	 * If we are releasing a write lock, then prefer to wake all
498 	 * outstanding readers.  Otherwise, wake one writer if there
499 	 * are outstanding readers, or all writers if there are no
500 	 * pending readers.  If waking one specific writer, the writer
501 	 * is handed the lock here.  If waking multiple writers, we
502 	 * set WRITE_WANTED to block out new readers, and let them
503 	 * do the work of acquiring the lock in rw_vector_enter().
504 	 */
505 	if (rcnt == 0 || decr == RW_READ_INCR) {
506 		RW_ASSERT(rw, wcnt != 0);
507 		RW_ASSERT(rw, (owner & RW_WRITE_WANTED) != 0);
508 
509 		if (rcnt != 0) {
510 			/* Give the lock to the longest waiting writer. */
511 			l = TS_FIRST(ts, TS_WRITER_Q);
512 			newown = (uintptr_t)l | (owner & RW_NODEBUG);
513 			newown |= RW_WRITE_LOCKED | RW_HAS_WAITERS;
514 			if (wcnt > 1)
515 				newown |= RW_WRITE_WANTED;
516 			rw_swap(rw, owner, newown);
517 			turnstile_wakeup(ts, TS_WRITER_Q, 1, l);
518 		} else {
519 			/* Wake all writers and let them fight it out. */
520 			newown = owner & RW_NODEBUG;
521 			newown |= RW_WRITE_WANTED;
522 			rw_swap(rw, owner, newown);
523 			turnstile_wakeup(ts, TS_WRITER_Q, wcnt, NULL);
524 		}
525 	} else {
526 		RW_ASSERT(rw, rcnt != 0);
527 
528 		/*
529 		 * Give the lock to all blocked readers.  If there
530 		 * is a writer waiting, new readers that arrive
531 		 * after the release will be blocked out.
532 		 */
533 		newown = owner & RW_NODEBUG;
534 		newown += rcnt << RW_READ_COUNT_SHIFT;
535 		if (wcnt != 0)
536 			newown |= RW_HAS_WAITERS | RW_WRITE_WANTED;
537 
538 		/* Wake up all sleeping readers. */
539 		rw_swap(rw, owner, newown);
540 		turnstile_wakeup(ts, TS_READER_Q, rcnt, NULL);
541 	}
542 }
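/*
 * Summary of the handoff cases above, where rcnt/wcnt are the numbers
 * of queued readers/writers found on the turnstile:
 *
 *	wcnt > 0, rcnt == 0:
 *		leave the lock unowned with RW_WRITE_WANTED set and wake
 *		all writers to compete in rw_vector_enter()
 *	releasing the last read hold, rcnt > 0 (so wcnt > 0 as well):
 *		hand the lock directly to the longest waiting writer
 *	releasing the write hold, rcnt > 0:
 *		convert the lock word to rcnt read holds and wake all
 *		readers; any remaining writers keep the waiter bits set
 */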
543 
544 /*
545  * rw_vector_tryenter:
546  *
547  *	Try to acquire a rwlock.
548  */
549 int
550 rw_vector_tryenter(krwlock_t *rw, const krw_t op)
551 {
552 	uintptr_t curthread, owner, incr, need_wait, next;
553 	lwp_t *l;
554 
555 	l = curlwp;
556 	curthread = (uintptr_t)l;
557 
558 	RW_ASSERT(rw, curthread != 0);
559 
560 	if (op == RW_READER) {
561 		incr = RW_READ_INCR;
562 		need_wait = RW_WRITE_LOCKED | RW_WRITE_WANTED;
563 	} else {
564 		RW_ASSERT(rw, op == RW_WRITER);
565 		incr = curthread | RW_WRITE_LOCKED;
566 		need_wait = RW_WRITE_LOCKED | RW_THREAD;
567 	}
568 
569 	for (owner = rw->rw_owner;; owner = next) {
570 		if (__predict_false((owner & need_wait) != 0))
571 			return 0;
572 		next = rw_cas(rw, owner, owner + incr);
573 		if (__predict_true(next == owner)) {
574 			/* Got it! */
575 			break;
576 		}
577 	}
578 
579 	RW_WANTLOCK(rw, op);
580 	RW_LOCKED(rw, op);
581 	RW_ASSERT(rw, (op != RW_READER && RW_OWNER(rw) == curthread) ||
582 	    (op == RW_READER && RW_COUNT(rw) != 0));
583 
584 	membar_acquire();
585 	return 1;
586 }
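/*
 * Illustrative sketch (hypothetical names): rw_tryenter() never blocks,
 * which makes it useful when the caller cannot sleep or would rather do
 * other work than wait.
 *
 *	if (rw_tryenter(&example_lock, RW_WRITER)) {
 *		example_update();
 *		rw_exit(&example_lock);
 *	} else {
 *		example_defer_update();
 *	}
 */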
587 
588 /*
589  * rw_downgrade:
590  *
591  *	Downgrade a write lock to a read lock.
592  */
593 void
594 rw_downgrade(krwlock_t *rw)
595 {
596 	uintptr_t owner, newown, next, curthread __diagused;
597 	turnstile_t *ts;
598 	int rcnt, wcnt;
599 	lwp_t *l;
600 
601 	l = curlwp;
602 	curthread = (uintptr_t)l;
603 	RW_ASSERT(rw, curthread != 0);
604 	RW_ASSERT(rw, (rw->rw_owner & RW_WRITE_LOCKED) != 0);
605 	RW_ASSERT(rw, RW_OWNER(rw) == curthread);
606 	RW_UNLOCKED(rw, RW_WRITER);
607 
608 	membar_release();
609 	for (owner = rw->rw_owner;; owner = next) {
610 		/*
611 		 * If there are no waiters we can do this the easy way.  Try
612 		 * swapping us down to one read hold.  If it fails, the lock
613 		 * condition has changed and we most likely now have
614 		 * waiters.
615 		 */
616 		if ((owner & RW_HAS_WAITERS) == 0) {
617 			newown = (owner & RW_NODEBUG);
618 			next = rw_cas(rw, owner, newown + RW_READ_INCR);
619 			if (__predict_true(next == owner)) {
620 				RW_LOCKED(rw, RW_READER);
621 				RW_ASSERT(rw,
622 				    (rw->rw_owner & RW_WRITE_LOCKED) == 0);
623 				RW_ASSERT(rw, RW_COUNT(rw) != 0);
624 				return;
625 			}
626 			continue;
627 		}
628 
629 		/*
630 		 * Grab the turnstile chain lock.  This gets the interlock
631 		 * on the sleep queue.  Once we have that, we can adjust the
632 		 * waiter bits.
633 		 */
634 		ts = turnstile_lookup(rw);
635 		RW_ASSERT(rw, ts != NULL);
636 
637 		rcnt = TS_WAITERS(ts, TS_READER_Q);
638 		wcnt = TS_WAITERS(ts, TS_WRITER_Q);
639 
640 		if (rcnt == 0) {
641 			/*
642 			 * If there are no readers, just preserve the
643 			 * waiters bits, swap us down to one read hold and
644 			 * return.
645 			 */
646 			RW_ASSERT(rw, wcnt != 0);
647 			RW_ASSERT(rw, (rw->rw_owner & RW_WRITE_WANTED) != 0);
648 			RW_ASSERT(rw, (rw->rw_owner & RW_HAS_WAITERS) != 0);
649 
650 			newown = owner & RW_NODEBUG;
651 			newown |= RW_READ_INCR | RW_HAS_WAITERS |
652 			    RW_WRITE_WANTED;
653 			next = rw_cas(rw, owner, newown);
654 			turnstile_exit(rw);
655 			if (__predict_true(next == owner))
656 				break;
657 		} else {
658 			/*
659 			 * Give the lock to all blocked readers, retaining
660 			 * one read hold for ourselves.  If there is a
661 			 * writer waiting, new readers will be blocked
662 			 * out.
663 			 */
664 			newown = owner & RW_NODEBUG;
665 			newown += (rcnt << RW_READ_COUNT_SHIFT) + RW_READ_INCR;
666 			if (wcnt != 0)
667 				newown |= RW_HAS_WAITERS | RW_WRITE_WANTED;
668 
669 			next = rw_cas(rw, owner, newown);
670 			if (__predict_true(next == owner)) {
671 				/* Wake up all sleeping readers. */
672 				turnstile_wakeup(ts, TS_READER_Q, rcnt, NULL);
673 				break;
674 			}
675 			turnstile_exit(rw);
676 		}
677 	}
678 
679 	RW_WANTLOCK(rw, RW_READER);
680 	RW_LOCKED(rw, RW_READER);
681 	RW_ASSERT(rw, (rw->rw_owner & RW_WRITE_LOCKED) == 0);
682 	RW_ASSERT(rw, RW_COUNT(rw) != 0);
683 }
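/*
 * Illustrative sketch (hypothetical names): a common pattern is to take
 * the write lock to modify a structure, then downgrade so that other
 * readers may proceed while the caller keeps a consistent view of what
 * it just wrote.
 *
 *	rw_enter(&example_lock, RW_WRITER);
 *	example_update();
 *	rw_downgrade(&example_lock);
 *	example_examine_result();
 *	rw_exit(&example_lock);
 */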
684 
685 /*
686  * rw_tryupgrade:
687  *
688  *	Try to upgrade a read lock to a write lock.  We must be the only
689  *	reader.
690  */
691 int
692 rw_tryupgrade(krwlock_t *rw)
693 {
694 	uintptr_t owner, curthread, newown, next;
695 	struct lwp *l;
696 
697 	l = curlwp;
698 	curthread = (uintptr_t)l;
699 	RW_ASSERT(rw, curthread != 0);
700 	RW_ASSERT(rw, rw_read_held(rw));
701 
702 	for (owner = RW_READ_INCR;; owner = next) {
703 		newown = curthread | RW_WRITE_LOCKED | (owner & ~RW_THREAD);
704 		next = rw_cas(rw, owner, newown);
705 		if (__predict_true(next == owner)) {
706 			membar_acquire();
707 			break;
708 		}
709 		RW_ASSERT(rw, (next & RW_WRITE_LOCKED) == 0);
710 		if (__predict_false((next & RW_THREAD) != RW_READ_INCR)) {
711 			RW_ASSERT(rw, (next & RW_THREAD) != 0);
712 			return 0;
713 		}
714 	}
715 
716 	RW_UNLOCKED(rw, RW_READER);
717 	RW_WANTLOCK(rw, RW_WRITER);
718 	RW_LOCKED(rw, RW_WRITER);
719 	RW_ASSERT(rw, rw->rw_owner & RW_WRITE_LOCKED);
720 	RW_ASSERT(rw, RW_OWNER(rw) == curthread);
721 
722 	return 1;
723 }
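/*
 * Illustrative sketch (hypothetical names): the upgrade succeeds only if
 * the caller holds the sole read reference, so callers must be prepared
 * to drop the read hold, take the write lock, and re-validate any state
 * that was examined under the read hold.
 *
 *	rw_enter(&example_lock, RW_READER);
 *	if (!rw_tryupgrade(&example_lock)) {
 *		rw_exit(&example_lock);
 *		rw_enter(&example_lock, RW_WRITER);
 *		(re-examine the protected state here, since the lock
 *		 was briefly dropped)
 *	}
 *	example_update();
 *	rw_exit(&example_lock);
 */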
724 
725 /*
726  * rw_read_held:
727  *
728  *	Returns true if the rwlock is held for reading.  Must only be
729  *	used for diagnostic assertions, and never be used to make
730  *	decisions about how to use a rwlock.
731  */
732 int
733 rw_read_held(krwlock_t *rw)
734 {
735 	uintptr_t owner;
736 
737 	if (rw == NULL)
738 		return 0;
739 	owner = rw->rw_owner;
740 	return (owner & RW_WRITE_LOCKED) == 0 && (owner & RW_THREAD) != 0;
741 }
742 
743 /*
744  * rw_write_held:
745  *
746  *	Returns true if the rwlock is held for writing.  Must only be
747  *	used for diagnostic assertions, and never be used to make
748  *	decisions about how to use a rwlock.
749  */
750 int
751 rw_write_held(krwlock_t *rw)
752 {
753 
754 	if (rw == NULL)
755 		return 0;
756 	return (rw->rw_owner & (RW_WRITE_LOCKED | RW_THREAD)) ==
757 	    (RW_WRITE_LOCKED | (uintptr_t)curlwp);
758 }
759 
760 /*
761  * rw_lock_held:
762  *
763  *	Returns true if the rwlock is held for reading or writing.  Must
764  *	only be used for diagnostic assertions, and never be used to make
765  *	decisions about how to use a rwlock.
766  */
767 int
768 rw_lock_held(krwlock_t *rw)
769 {
770 
771 	if (rw == NULL)
772 		return 0;
773 	return (rw->rw_owner & RW_THREAD) != 0;
774 }
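/*
 * Illustrative sketch: the held predicates above are intended only for
 * assertions, for example to document a function's locking contract
 * ("example_softc" is hypothetical):
 *
 *	static void
 *	example_update_locked(struct example_softc *sc)
 *	{
 *
 *		KASSERT(rw_write_held(&sc->sc_lock));
 *		...
 *	}
 */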
775 
776 /*
777  * rw_lock_op:
778  *
779  *	For a rwlock that is known to be held by the caller, return
780  *	RW_READER or RW_WRITER to describe the hold type.
781  */
782 krw_t
783 rw_lock_op(krwlock_t *rw)
784 {
785 
786 	RW_ASSERT(rw, rw_lock_held(rw));
787 
788 	return (rw->rw_owner & RW_WRITE_LOCKED) != 0 ? RW_WRITER : RW_READER;
789 }
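/*
 * Illustrative sketch (hypothetical names): rw_lock_op() allows a caller
 * to drop a lock held in either mode and later reacquire it the same
 * way.
 *
 *	krw_t op = rw_lock_op(&example_lock);
 *
 *	rw_exit(&example_lock);
 *	example_sleepable_operation();
 *	rw_enter(&example_lock, op);
 */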
790 
791 /*
792  * rw_owner:
793  *
794  *	Return the current owner of an RW lock, but only if it is write
795  *	held.  Used for priority inheritance.
796  */
797 static lwp_t *
798 rw_owner(wchan_t obj)
799 {
800 	krwlock_t *rw = (void *)(uintptr_t)obj; /* discard qualifiers */
801 	uintptr_t owner = rw->rw_owner;
802 
803 	if ((owner & RW_WRITE_LOCKED) == 0)
804 		return NULL;
805 
806 	return (void *)(owner & RW_THREAD);
807 }
808