/*	$NetBSD: kern_rwlock.c,v 1.70 2023/02/24 11:11:10 riastradh Exp $	*/

/*-
 * Copyright (c) 2002, 2006, 2007, 2008, 2009, 2019, 2020
 *     The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe and Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Kernel reader/writer lock implementation, modeled after those
 * found in Solaris, a description of which can be found in:
 *
 *	Solaris Internals: Core Kernel Architecture, Jim Mauro and
 *	    Richard McDougall.
 *
 * The NetBSD implementation differs from that described in the book, in
 * that the locks are partially adaptive.  Lock waiters spin wait while a
 * lock is write held and the holder is still running on a CPU.  The method
 * of choosing which threads to awaken when a lock is released also differs,
 * mainly to take account of the partially adaptive behaviour.
 */

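/*
 * A minimal usage sketch of the interface implemented below (see
 * rwlock(9)); "example_lock" is only an illustrative name, not part
 * of this file:
 *
 *	static krwlock_t example_lock;
 *
 *	rw_init(&example_lock);
 *
 *	rw_enter(&example_lock, RW_READER);	shared (read) hold
 *	...
 *	rw_exit(&example_lock);
 *
 *	rw_enter(&example_lock, RW_WRITER);	exclusive (write) hold
 *	...
 *	rw_exit(&example_lock);
 *
 *	rw_destroy(&example_lock);
 */
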
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_rwlock.c,v 1.70 2023/02/24 11:11:10 riastradh Exp $");

#include "opt_lockdebug.h"

#define	__RWLOCK_PRIVATE

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sleepq.h>
#include <sys/systm.h>
#include <sys/lockdebug.h>
#include <sys/cpu.h>
#include <sys/atomic.h>
#include <sys/lock.h>
#include <sys/pserialize.h>

#include <dev/lockstat.h>

#include <machine/rwlock.h>

/*
 * LOCKDEBUG
 */

#define	RW_DEBUG_P(rw)		(((rw)->rw_owner & RW_NODEBUG) == 0)

#define	RW_WANTLOCK(rw, op) \
    LOCKDEBUG_WANTLOCK(RW_DEBUG_P(rw), (rw), \
        (uintptr_t)__builtin_return_address(0), op == RW_READER);
#define	RW_LOCKED(rw, op) \
    LOCKDEBUG_LOCKED(RW_DEBUG_P(rw), (rw), NULL, \
        (uintptr_t)__builtin_return_address(0), op == RW_READER);
#define	RW_UNLOCKED(rw, op) \
    LOCKDEBUG_UNLOCKED(RW_DEBUG_P(rw), (rw), \
        (uintptr_t)__builtin_return_address(0), op == RW_READER);

/*
 * DIAGNOSTIC
 */

#if defined(DIAGNOSTIC)
#define	RW_ASSERT(rw, cond) \
do { \
	if (__predict_false(!(cond))) \
		rw_abort(__func__, __LINE__, rw, "assertion failed: " #cond);\
} while (/* CONSTCOND */ 0)
#else
#define	RW_ASSERT(rw, cond)	/* nothing */
#endif	/* DIAGNOSTIC */

/*
 * For platforms that do not provide stubs, or for the LOCKDEBUG case.
 */
#ifdef LOCKDEBUG
#undef	__HAVE_RW_STUBS
#endif

#ifndef __HAVE_RW_STUBS
__strong_alias(rw_enter,rw_vector_enter);
__strong_alias(rw_exit,rw_vector_exit);
__strong_alias(rw_tryenter,rw_vector_tryenter);
#endif

static void	rw_abort(const char *, size_t, krwlock_t *, const char *);
static void	rw_dump(const volatile void *, lockop_printer_t);
static lwp_t	*rw_owner(wchan_t);

lockops_t rwlock_lockops = {
	.lo_name = "Reader / writer lock",
	.lo_type = LOCKOPS_SLEEP,
	.lo_dump = rw_dump,
};

syncobj_t rw_syncobj = {
	.sobj_flag	= SOBJ_SLEEPQ_SORTED,
	.sobj_unsleep	= turnstile_unsleep,
	.sobj_changepri	= turnstile_changepri,
	.sobj_lendpri	= sleepq_lendpri,
	.sobj_owner	= rw_owner,
};

/*
 * rw_cas:
 *
 *	Do an atomic compare-and-swap on the lock word.
 */
static inline uintptr_t
rw_cas(krwlock_t *rw, uintptr_t o, uintptr_t n)
{

	return (uintptr_t)atomic_cas_ptr((volatile void *)&rw->rw_owner,
	    (void *)o, (void *)n);
}

/*
 * rw_swap:
 *
 *	Do an atomic swap of the lock word.  This is used only when it's
 *	known that the lock word is set up such that it can't be changed
 *	behind us (assert this), so there's no point considering the result.
 */
static inline void
rw_swap(krwlock_t *rw, uintptr_t o, uintptr_t n)
{

	n = (uintptr_t)atomic_swap_ptr((volatile void *)&rw->rw_owner,
	    (void *)n);

	RW_ASSERT(rw, n == o);
	RW_ASSERT(rw, (o & RW_HAS_WAITERS) != 0);
}

/*
 * rw_dump:
 *
 *	Dump the contents of a rwlock structure.
 */
static void
rw_dump(const volatile void *cookie, lockop_printer_t pr)
{
	const volatile krwlock_t *rw = cookie;

	pr("owner/count  : %#018lx flags    : %#018x\n",
	    (long)RW_OWNER(rw), (int)RW_FLAGS(rw));
}

/*
 * rw_abort:
 *
 *	Dump information about an error and panic the system.  This
 *	generates a lot of machine code in the DIAGNOSTIC case, so
 *	we ask the compiler to not inline it.
 */
static void __noinline
rw_abort(const char *func, size_t line, krwlock_t *rw, const char *msg)
{

	if (__predict_false(panicstr != NULL))
		return;

	LOCKDEBUG_ABORT(func, line, rw, &rwlock_lockops, msg);
}

/*
 * rw_init:
 *
 *	Initialize a rwlock for use.
 */
void
_rw_init(krwlock_t *rw, uintptr_t return_address)
{

#ifdef LOCKDEBUG
	/* XXX only because the assembly stubs can't handle RW_NODEBUG */
	if (LOCKDEBUG_ALLOC(rw, &rwlock_lockops, return_address))
		rw->rw_owner = 0;
	else
		rw->rw_owner = RW_NODEBUG;
#else
	rw->rw_owner = 0;
#endif
}

void
rw_init(krwlock_t *rw)
{

	_rw_init(rw, (uintptr_t)__builtin_return_address(0));
}

/*
 * rw_destroy:
 *
 *	Tear down a rwlock.
 */
void
rw_destroy(krwlock_t *rw)
{

	RW_ASSERT(rw, (rw->rw_owner & ~RW_NODEBUG) == 0);
	LOCKDEBUG_FREE((rw->rw_owner & RW_NODEBUG) == 0, rw);
}

/*
 * rw_oncpu:
 *
 *	Return true if an rwlock owner is running on a CPU in the system.
 *	If the target is waiting on the kernel big lock, then we must
 *	release it.  This is necessary to avoid deadlock.
 */
static bool
rw_oncpu(uintptr_t owner)
{
#ifdef MULTIPROCESSOR
	struct cpu_info *ci;
	lwp_t *l;

	KASSERT(kpreempt_disabled());

	if ((owner & (RW_WRITE_LOCKED|RW_HAS_WAITERS)) != RW_WRITE_LOCKED) {
		return false;
	}

	/*
	 * See lwp_dtor() for why it is safe to dereference the LWP
	 * pointer here.  We must have kernel preemption disabled for that.
	 */
	l = (lwp_t *)(owner & RW_THREAD);
	ci = l->l_cpu;

	if (ci && ci->ci_curlwp == l) {
		/* Target is running; do we need to block? */
		return (ci->ci_biglock_wanted != l);
	}
#endif
	/* Not running.  It may be safe to block now. */
	return false;
}

/*
 * rw_vector_enter:
 *
 *	Acquire a rwlock.
 */
void
rw_vector_enter(krwlock_t *rw, const krw_t op)
{
	uintptr_t owner, incr, need_wait, set_wait, curthread, next;
	turnstile_t *ts;
	int queue;
	lwp_t *l;
	LOCKSTAT_TIMER(slptime);
	LOCKSTAT_TIMER(slpcnt);
	LOCKSTAT_TIMER(spintime);
	LOCKSTAT_COUNTER(spincnt);
	LOCKSTAT_FLAG(lsflag);

	l = curlwp;
	curthread = (uintptr_t)l;

	RW_ASSERT(rw, !cpu_intr_p());
	RW_ASSERT(rw, curthread != 0);
	RW_WANTLOCK(rw, op);

	if (__predict_true(panicstr == NULL)) {
		KDASSERT(pserialize_not_in_read_section());
		LOCKDEBUG_BARRIER(&kernel_lock, 1);
	}

	/*
	 * We play a slight trick here.  If we're a reader, we want to
	 * increment the read count.  If we're a writer, we want to
	 * set the owner field and the WRITE_LOCKED bit.
	 *
	 * In the latter case, we expect those bits to be zero,
	 * therefore we can use an add operation to set them, which
	 * means an add operation for both cases.
	 */
	if (__predict_true(op == RW_READER)) {
		incr = RW_READ_INCR;
		set_wait = RW_HAS_WAITERS;
		need_wait = RW_WRITE_LOCKED | RW_WRITE_WANTED;
		queue = TS_READER_Q;
	} else {
		RW_ASSERT(rw, op == RW_WRITER);
		incr = curthread | RW_WRITE_LOCKED;
		set_wait = RW_HAS_WAITERS | RW_WRITE_WANTED;
		need_wait = RW_WRITE_LOCKED | RW_THREAD;
		queue = TS_WRITER_Q;
	}

	LOCKSTAT_ENTER(lsflag);

	KPREEMPT_DISABLE(curlwp);
	for (owner = rw->rw_owner;;) {
		/*
		 * Read the lock owner field.  If the need-to-wait
		 * indicator is clear, then try to acquire the lock.
		 */
		if ((owner & need_wait) == 0) {
			next = rw_cas(rw, owner, (owner + incr) &
			    ~RW_WRITE_WANTED);
			if (__predict_true(next == owner)) {
				/* Got it! */
				membar_acquire();
				break;
			}

			/*
			 * Didn't get it -- spin around again (we'll
			 * probably sleep on the next iteration).
			 */
			owner = next;
			continue;
		}
		if (__predict_false(RW_OWNER(rw) == curthread)) {
			rw_abort(__func__, __LINE__, rw,
			    "locking against myself");
		}
		/*
		 * If the lock owner is running on another CPU, and
		 * there are no existing waiters, then spin.
		 */
		if (rw_oncpu(owner)) {
			LOCKSTAT_START_TIMER(lsflag, spintime);
			u_int count = SPINLOCK_BACKOFF_MIN;
			do {
				KPREEMPT_ENABLE(curlwp);
				SPINLOCK_BACKOFF(count);
				KPREEMPT_DISABLE(curlwp);
				owner = rw->rw_owner;
			} while (rw_oncpu(owner));
			LOCKSTAT_STOP_TIMER(lsflag, spintime);
			LOCKSTAT_COUNT(spincnt, 1);
			if ((owner & need_wait) == 0)
				continue;
		}

		/*
		 * Grab the turnstile chain lock.  Once we have that, we
		 * can adjust the waiter bits and sleep queue.
		 */
		ts = turnstile_lookup(rw);

		/*
		 * Mark the rwlock as having waiters.  If the set fails,
		 * then we may not need to sleep and should spin again.
		 * Reload rw_owner because turnstile_lookup() may have
		 * spun on the turnstile chain lock.
		 */
		owner = rw->rw_owner;
		if ((owner & need_wait) == 0 || rw_oncpu(owner)) {
			turnstile_exit(rw);
			continue;
		}
		next = rw_cas(rw, owner, owner | set_wait);
		/* XXX membar? */
		if (__predict_false(next != owner)) {
			turnstile_exit(rw);
			owner = next;
			continue;
		}

		LOCKSTAT_START_TIMER(lsflag, slptime);
		turnstile_block(ts, queue, rw, &rw_syncobj);
		LOCKSTAT_STOP_TIMER(lsflag, slptime);
		LOCKSTAT_COUNT(slpcnt, 1);

		/*
		 * No need for a memory barrier because of context switch.
		 * If not handed the lock, then spin again.
		 */
		if (op == RW_READER || (rw->rw_owner & RW_THREAD) == curthread)
			break;

		owner = rw->rw_owner;
	}
	KPREEMPT_ENABLE(curlwp);

	LOCKSTAT_EVENT_RA(lsflag, rw, LB_RWLOCK |
	    (op == RW_WRITER ? LB_SLEEP1 : LB_SLEEP2), slpcnt, slptime,
	    (l->l_rwcallsite != 0 ? l->l_rwcallsite :
	      (uintptr_t)__builtin_return_address(0)));
	LOCKSTAT_EVENT_RA(lsflag, rw, LB_RWLOCK | LB_SPIN, spincnt, spintime,
	    (l->l_rwcallsite != 0 ? l->l_rwcallsite :
	      (uintptr_t)__builtin_return_address(0)));
	LOCKSTAT_EXIT(lsflag);

	RW_ASSERT(rw, (op != RW_READER && RW_OWNER(rw) == curthread) ||
	    (op == RW_READER && RW_COUNT(rw) != 0));
	RW_LOCKED(rw, op);
}

/*
 * rw_vector_exit:
 *
 *	Release a rwlock.
 */
void
rw_vector_exit(krwlock_t *rw)
{
	uintptr_t curthread, owner, decr, newown, next;
	turnstile_t *ts;
	int rcnt, wcnt;
	lwp_t *l;

	l = curlwp;
	curthread = (uintptr_t)l;
	RW_ASSERT(rw, curthread != 0);

	/*
	 * Again, we use a trick.  Since we used an add operation to
	 * set the required lock bits, we can use a subtract to clear
	 * them, which makes the read-release and write-release path
	 * the same.
	 */
	owner = rw->rw_owner;
	if (__predict_false((owner & RW_WRITE_LOCKED) != 0)) {
		RW_UNLOCKED(rw, RW_WRITER);
		RW_ASSERT(rw, RW_OWNER(rw) == curthread);
		decr = curthread | RW_WRITE_LOCKED;
	} else {
		RW_UNLOCKED(rw, RW_READER);
		RW_ASSERT(rw, RW_COUNT(rw) != 0);
		decr = RW_READ_INCR;
	}

	/*
	 * Compute what we expect the new value of the lock to be. Only
	 * proceed to do direct handoff if there are waiters, and if the
	 * lock would become unowned.
	 */
	membar_release();
	for (;;) {
		newown = (owner - decr);
		if ((newown & (RW_THREAD | RW_HAS_WAITERS)) == RW_HAS_WAITERS)
			break;
		next = rw_cas(rw, owner, newown);
		if (__predict_true(next == owner))
			return;
		owner = next;
	}

	/*
	 * Grab the turnstile chain lock.  This gets the interlock
	 * on the sleep queue.  Once we have that, we can adjust the
	 * waiter bits.
	 */
	ts = turnstile_lookup(rw);
	owner = rw->rw_owner;
	RW_ASSERT(rw, ts != NULL);
	RW_ASSERT(rw, (owner & RW_HAS_WAITERS) != 0);

	wcnt = TS_WAITERS(ts, TS_WRITER_Q);
	rcnt = TS_WAITERS(ts, TS_READER_Q);

	/*
	 * Give the lock away.
	 *
	 * If we are releasing a write lock, then prefer to wake all
	 * outstanding readers.  Otherwise, wake one writer if there
	 * are outstanding readers, or all writers if there are no
	 * pending readers.  If waking one specific writer, the writer
	 * is handed the lock here.  If waking multiple writers, we
	 * set WRITE_WANTED to block out new readers, and let them
	 * do the work of acquiring the lock in rw_vector_enter().
	 */
	if (rcnt == 0 || decr == RW_READ_INCR) {
		RW_ASSERT(rw, wcnt != 0);
		RW_ASSERT(rw, (owner & RW_WRITE_WANTED) != 0);

		if (rcnt != 0) {
			/* Give the lock to the longest waiting writer. */
			l = TS_FIRST(ts, TS_WRITER_Q);
			newown = (uintptr_t)l | (owner & RW_NODEBUG);
			newown |= RW_WRITE_LOCKED | RW_HAS_WAITERS;
			if (wcnt > 1)
				newown |= RW_WRITE_WANTED;
			rw_swap(rw, owner, newown);
			turnstile_wakeup(ts, TS_WRITER_Q, 1, l);
		} else {
			/* Wake all writers and let them fight it out. */
			newown = owner & RW_NODEBUG;
			newown |= RW_WRITE_WANTED;
			rw_swap(rw, owner, newown);
			turnstile_wakeup(ts, TS_WRITER_Q, wcnt, NULL);
		}
	} else {
		RW_ASSERT(rw, rcnt != 0);

		/*
		 * Give the lock to all blocked readers.  If there
		 * is a writer waiting, new readers that arrive
		 * after the release will be blocked out.
		 */
		newown = owner & RW_NODEBUG;
		newown += rcnt << RW_READ_COUNT_SHIFT;
		if (wcnt != 0)
			newown |= RW_HAS_WAITERS | RW_WRITE_WANTED;

		/* Wake up all sleeping readers. */
		rw_swap(rw, owner, newown);
		turnstile_wakeup(ts, TS_READER_Q, rcnt, NULL);
	}
}

/*
 * rw_vector_tryenter:
 *
 *	Try to acquire a rwlock.
 */
int
rw_vector_tryenter(krwlock_t *rw, const krw_t op)
{
	uintptr_t curthread, owner, incr, need_wait, next;
	lwp_t *l;

	l = curlwp;
	curthread = (uintptr_t)l;

	RW_ASSERT(rw, curthread != 0);

	if (op == RW_READER) {
		incr = RW_READ_INCR;
		need_wait = RW_WRITE_LOCKED | RW_WRITE_WANTED;
	} else {
		RW_ASSERT(rw, op == RW_WRITER);
		incr = curthread | RW_WRITE_LOCKED;
		need_wait = RW_WRITE_LOCKED | RW_THREAD;
	}

	for (owner = rw->rw_owner;; owner = next) {
		if (__predict_false((owner & need_wait) != 0))
			return 0;
		next = rw_cas(rw, owner, owner + incr);
		if (__predict_true(next == owner)) {
			/* Got it! */
			break;
		}
	}

	RW_WANTLOCK(rw, op);
	RW_LOCKED(rw, op);
	RW_ASSERT(rw, (op != RW_READER && RW_OWNER(rw) == curthread) ||
	    (op == RW_READER && RW_COUNT(rw) != 0));

	membar_acquire();
	return 1;
}

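/*
 * Caller-side sketch (illustrative only, not code from this file):
 * rw_tryenter() is the non-blocking variant, typically used where
 * sleeping is not allowed or the usual lock order cannot be followed.
 *
 *	if (!rw_tryenter(&example_lock, RW_WRITER))
 *		return EBUSY;	back off and retry later
 *	...
 *	rw_exit(&example_lock);
 */
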
/*
 * rw_downgrade:
 *
 *	Downgrade a write lock to a read lock.
 */
void
rw_downgrade(krwlock_t *rw)
{
	uintptr_t owner, curthread, newown, next;
	turnstile_t *ts;
	int rcnt, wcnt;
	lwp_t *l;

	l = curlwp;
	curthread = (uintptr_t)l;
	RW_ASSERT(rw, curthread != 0);
	RW_ASSERT(rw, (rw->rw_owner & RW_WRITE_LOCKED) != 0);
	RW_ASSERT(rw, RW_OWNER(rw) == curthread);
	RW_UNLOCKED(rw, RW_WRITER);
#if !defined(DIAGNOSTIC)
	__USE(curthread);
#endif

	membar_release();
	for (owner = rw->rw_owner;; owner = next) {
		/*
		 * If there are no waiters we can do this the easy way.  Try
		 * swapping us down to one read hold.  If it fails, the lock
		 * condition has changed and we most likely now have
		 * waiters.
		 */
		if ((owner & RW_HAS_WAITERS) == 0) {
			newown = (owner & RW_NODEBUG);
			next = rw_cas(rw, owner, newown + RW_READ_INCR);
			if (__predict_true(next == owner)) {
				RW_LOCKED(rw, RW_READER);
				RW_ASSERT(rw,
				    (rw->rw_owner & RW_WRITE_LOCKED) == 0);
				RW_ASSERT(rw, RW_COUNT(rw) != 0);
				return;
			}
			continue;
		}

		/*
		 * Grab the turnstile chain lock.  This gets the interlock
		 * on the sleep queue.  Once we have that, we can adjust the
		 * waiter bits.
		 */
		ts = turnstile_lookup(rw);
		RW_ASSERT(rw, ts != NULL);

		rcnt = TS_WAITERS(ts, TS_READER_Q);
		wcnt = TS_WAITERS(ts, TS_WRITER_Q);

		if (rcnt == 0) {
			/*
			 * If there are no readers, just preserve the
			 * waiters bits, swap us down to one read hold and
			 * return.
			 */
			RW_ASSERT(rw, wcnt != 0);
			RW_ASSERT(rw, (rw->rw_owner & RW_WRITE_WANTED) != 0);
			RW_ASSERT(rw, (rw->rw_owner & RW_HAS_WAITERS) != 0);

			newown = owner & RW_NODEBUG;
			newown |= RW_READ_INCR | RW_HAS_WAITERS |
			    RW_WRITE_WANTED;
			next = rw_cas(rw, owner, newown);
			turnstile_exit(rw);
			if (__predict_true(next == owner))
				break;
		} else {
			/*
			 * Give the lock to all blocked readers, retaining
			 * one read hold for ourselves (this is the
			 * downgrade).  If there is a writer waiting, new
			 * readers will be blocked out.
			 */
			newown = owner & RW_NODEBUG;
			newown += (rcnt << RW_READ_COUNT_SHIFT) + RW_READ_INCR;
			if (wcnt != 0)
				newown |= RW_HAS_WAITERS | RW_WRITE_WANTED;

			next = rw_cas(rw, owner, newown);
			if (__predict_true(next == owner)) {
				/* Wake up all sleeping readers. */
				turnstile_wakeup(ts, TS_READER_Q, rcnt, NULL);
				break;
			}
			turnstile_exit(rw);
		}
	}

	RW_WANTLOCK(rw, RW_READER);
	RW_LOCKED(rw, RW_READER);
	RW_ASSERT(rw, (rw->rw_owner & RW_WRITE_LOCKED) == 0);
	RW_ASSERT(rw, RW_COUNT(rw) != 0);
}

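/*
 * Caller-side sketch (illustrative only): a typical downgrade sets an
 * object up under the write hold, then keeps only a read hold while
 * continuing to use it:
 *
 *	rw_enter(&example_lock, RW_WRITER);
 *	...initialize or modify state...
 *	rw_downgrade(&example_lock);
 *	...read state; other readers may now enter as well...
 *	rw_exit(&example_lock);
 */
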
/*
 * rw_tryupgrade:
 *
 *	Try to upgrade a read lock to a write lock.  We must be the only
 *	reader.
 */
int
rw_tryupgrade(krwlock_t *rw)
{
	uintptr_t owner, curthread, newown, next;
	struct lwp *l;

	l = curlwp;
	curthread = (uintptr_t)l;
	RW_ASSERT(rw, curthread != 0);
	RW_ASSERT(rw, rw_read_held(rw));

	for (owner = RW_READ_INCR;; owner = next) {
		newown = curthread | RW_WRITE_LOCKED | (owner & ~RW_THREAD);
		next = rw_cas(rw, owner, newown);
		if (__predict_true(next == owner)) {
			membar_acquire();
			break;
		}
		RW_ASSERT(rw, (next & RW_WRITE_LOCKED) == 0);
		if (__predict_false((next & RW_THREAD) != RW_READ_INCR)) {
			RW_ASSERT(rw, (next & RW_THREAD) != 0);
			return 0;
		}
	}

	RW_UNLOCKED(rw, RW_READER);
	RW_WANTLOCK(rw, RW_WRITER);
	RW_LOCKED(rw, RW_WRITER);
	RW_ASSERT(rw, rw->rw_owner & RW_WRITE_LOCKED);
	RW_ASSERT(rw, RW_OWNER(rw) == curthread);

	return 1;
}

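/*
 * Caller-side sketch (illustrative only): since rw_tryupgrade() fails
 * whenever another reader also holds the lock, callers usually fall
 * back to dropping the read hold and reacquiring for write,
 * revalidating their state afterwards:
 *
 *	if (!rw_tryupgrade(&example_lock)) {
 *		rw_exit(&example_lock);
 *		rw_enter(&example_lock, RW_WRITER);
 *		...state may have changed while the lock was dropped...
 *	}
 */
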
/*
 * rw_read_held:
 *
 *	Returns true if the rwlock is held for reading.  Must only be
 *	used for diagnostic assertions, and never be used to make
 *	decisions about how to use a rwlock.
 */
int
rw_read_held(krwlock_t *rw)
{
	uintptr_t owner;

	if (rw == NULL)
		return 0;
	owner = rw->rw_owner;
	return (owner & RW_WRITE_LOCKED) == 0 && (owner & RW_THREAD) != 0;
}

/*
 * rw_write_held:
 *
 *	Returns true if the rwlock is held for writing.  Must only be
 *	used for diagnostic assertions, and never be used to make
 *	decisions about how to use a rwlock.
 */
int
rw_write_held(krwlock_t *rw)
{

	if (rw == NULL)
		return 0;
	return (rw->rw_owner & (RW_WRITE_LOCKED | RW_THREAD)) ==
	    (RW_WRITE_LOCKED | (uintptr_t)curlwp);
}

/*
 * rw_lock_held:
 *
 *	Returns true if the rwlock is held for reading or writing.  Must
 *	only be used for diagnostic assertions, and never be used to make
 *	decisions about how to use a rwlock.
 */
int
rw_lock_held(krwlock_t *rw)
{

	if (rw == NULL)
		return 0;
	return (rw->rw_owner & RW_THREAD) != 0;
}

/*
 * rw_lock_op:
 *
 *	For a rwlock that is known to be held by the caller, return
 *	RW_READER or RW_WRITER to describe the hold type.
 */
krw_t
rw_lock_op(krwlock_t *rw)
{

	RW_ASSERT(rw, rw_lock_held(rw));

	return (rw->rw_owner & RW_WRITE_LOCKED) != 0 ? RW_WRITER : RW_READER;
}

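/*
 * Caller-side sketch (illustrative only): rw_lock_op() lets code that
 * must temporarily drop a lock it did not acquire itself restore the
 * same hold type afterwards:
 *
 *	krw_t op = rw_lock_op(&example_lock);
 *	rw_exit(&example_lock);
 *	...operation that may sleep...
 *	rw_enter(&example_lock, op);
 */
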
/*
 * rw_owner:
 *
 *	Return the current owner of an RW lock, but only if it is write
 *	held.  Used for priority inheritance.
 */
static lwp_t *
rw_owner(wchan_t obj)
{
	krwlock_t *rw = (void *)(uintptr_t)obj; /* discard qualifiers */
	uintptr_t owner = rw->rw_owner;

	if ((owner & RW_WRITE_LOCKED) == 0)
		return NULL;

	return (void *)(owner & RW_THREAD);
}

/*
 * rw_owner_running:
 *
 *	Return true if a RW lock is unheld, or write held and the owner is
 *	running on a CPU.  For the pagedaemon.
 */
bool
rw_owner_running(const krwlock_t *rw)
{
#ifdef MULTIPROCESSOR
	uintptr_t owner;
	bool rv;

	kpreempt_disable();
	owner = rw->rw_owner;
	rv = (owner & RW_THREAD) == 0 || rw_oncpu(owner);
	kpreempt_enable();
	return rv;
#else
	return rw_owner(rw) == curlwp;
#endif
}