xref: /netbsd-src/sys/kern/kern_rwlock.c (revision ce099b40997c43048fb78bd578195f81d2456523)
1 /*	$NetBSD: kern_rwlock.c,v 1.22 2008/04/28 20:24:03 martin Exp $	*/
2 
3 /*-
4  * Copyright (c) 2002, 2006, 2007, 2008 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Jason R. Thorpe and Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Kernel reader/writer lock implementation, modeled after those
34  * found in Solaris, a description of which can be found in:
35  *
36  *	Solaris Internals: Core Kernel Architecture, Jim Mauro and
37  *	    Richard McDougall.
38  */
39 
40 #include <sys/cdefs.h>
41 __KERNEL_RCSID(0, "$NetBSD: kern_rwlock.c,v 1.22 2008/04/28 20:24:03 martin Exp $");
42 
43 #include "opt_multiprocessor.h"
44 
45 #define	__RWLOCK_PRIVATE
46 
47 #include <sys/param.h>
48 #include <sys/proc.h>
49 #include <sys/rwlock.h>
50 #include <sys/sched.h>
51 #include <sys/sleepq.h>
52 #include <sys/systm.h>
53 #include <sys/lockdebug.h>
54 #include <sys/cpu.h>
55 #include <sys/atomic.h>
56 #include <sys/lock.h>
57 
58 #include <dev/lockstat.h>
59 
60 /*
61  * LOCKDEBUG
62  */
63 
64 #if defined(LOCKDEBUG)
65 
66 #define	RW_WANTLOCK(rw, op)						\
67 	LOCKDEBUG_WANTLOCK(RW_DEBUG_P(rw), (rw),			\
68 	    (uintptr_t)__builtin_return_address(0), op == RW_READER);
69 #define	RW_LOCKED(rw, op)						\
70 	LOCKDEBUG_LOCKED(RW_DEBUG_P(rw), (rw),				\
71 	    (uintptr_t)__builtin_return_address(0), op == RW_READER);
72 #define	RW_UNLOCKED(rw, op)						\
73 	LOCKDEBUG_UNLOCKED(RW_DEBUG_P(rw), (rw),			\
74 	    (uintptr_t)__builtin_return_address(0), op == RW_READER);
75 #define	RW_DASSERT(rw, cond)						\
76 do {									\
77 	if (!(cond))							\
78 		rw_abort(rw, __func__, "assertion failed: " #cond);	\
79 } while (/* CONSTCOND */ 0)
80 
81 #else	/* LOCKDEBUG */
82 
83 #define	RW_WANTLOCK(rw, op)	/* nothing */
84 #define	RW_LOCKED(rw, op)	/* nothing */
85 #define	RW_UNLOCKED(rw, op)	/* nothing */
86 #define	RW_DASSERT(rw, cond)	/* nothing */
87 
88 #endif	/* LOCKDEBUG */
89 
90 /*
91  * DIAGNOSTIC
92  */
93 
94 #if defined(DIAGNOSTIC)
95 
96 #define	RW_ASSERT(rw, cond)						\
97 do {									\
98 	if (!(cond))							\
99 		rw_abort(rw, __func__, "assertion failed: " #cond);	\
100 } while (/* CONSTCOND */ 0)
101 
102 #else
103 
104 #define	RW_ASSERT(rw, cond)	/* nothing */
105 
106 #endif	/* DIAGNOSTIC */
107 
108 #define	RW_SETDEBUG(rw, on)		((rw)->rw_owner |= (on) ? RW_DEBUG : 0)
109 #define	RW_DEBUG_P(rw)			(((rw)->rw_owner & RW_DEBUG) != 0)
110 #if defined(LOCKDEBUG)
111 #define	RW_INHERITDEBUG(new, old)	(new) |= (old) & RW_DEBUG
112 #else /* defined(LOCKDEBUG) */
113 #define	RW_INHERITDEBUG(new, old)	/* nothing */
114 #endif /* defined(LOCKDEBUG) */
115 
116 static void	rw_abort(krwlock_t *, const char *, const char *);
117 static void	rw_dump(volatile void *);
118 static lwp_t	*rw_owner(wchan_t);
119 
120 static inline uintptr_t
121 rw_cas(krwlock_t *rw, uintptr_t o, uintptr_t n)
122 {
123 
124 	RW_INHERITDEBUG(n, o);
125 	return (uintptr_t)atomic_cas_ptr((volatile void *)&rw->rw_owner,
126 	    (void *)o, (void *)n);
127 }
128 
129 static inline void
130 rw_swap(krwlock_t *rw, uintptr_t o, uintptr_t n)
131 {
132 
133 	RW_INHERITDEBUG(n, o);
134 	n = (uintptr_t)atomic_swap_ptr((volatile void *)&rw->rw_owner,
135 	    (void *)n);
136 	RW_DASSERT(rw, n == o);
137 }
138 
139 /*
140  * For platforms that do not provide stubs, or for the LOCKDEBUG case.
141  */
142 #ifdef LOCKDEBUG
143 #undef	__HAVE_RW_STUBS
144 #endif
145 
146 #ifndef __HAVE_RW_STUBS
147 __strong_alias(rw_enter,rw_vector_enter);
148 __strong_alias(rw_exit,rw_vector_exit);
149 __strong_alias(rw_tryenter,rw_vector_tryenter);
150 #endif
151 
152 lockops_t rwlock_lockops = {
153 	"Reader / writer lock",
154 	1,
155 	rw_dump
156 };
157 
158 syncobj_t rw_syncobj = {
159 	SOBJ_SLEEPQ_SORTED,
160 	turnstile_unsleep,
161 	turnstile_changepri,
162 	sleepq_lendpri,
163 	rw_owner,
164 };
165 
166 /*
167  * rw_dump:
168  *
169  *	Dump the contents of a rwlock structure.
170  */
171 static void
172 rw_dump(volatile void *cookie)
173 {
174 	volatile krwlock_t *rw = cookie;
175 
176 	printf_nolog("owner/count  : %#018lx flags    : %#018x\n",
177 	    (long)RW_OWNER(rw), (int)RW_FLAGS(rw));
178 }
179 
180 /*
181  * rw_abort:
182  *
183  *	Dump information about an error and panic the system.  This
184  *	generates a lot of machine code in the DIAGNOSTIC case, so
185  *	we ask the compiler to not inline it.
186  */
187 #if __GNUC_PREREQ__(3, 0)
188 __attribute ((noinline))
189 #endif
190 static void
191 rw_abort(krwlock_t *rw, const char *func, const char *msg)
192 {
193 
194 	if (panicstr != NULL)
195 		return;
196 
197 	LOCKDEBUG_ABORT(rw, &rwlock_lockops, func, msg);
198 }
199 
200 /*
201  * rw_init:
202  *
203  *	Initialize a rwlock for use.
204  */
205 void
206 rw_init(krwlock_t *rw)
207 {
208 	bool dodebug;
209 
210 	memset(rw, 0, sizeof(*rw));
211 
212 	dodebug = LOCKDEBUG_ALLOC(rw, &rwlock_lockops,
213 	    (uintptr_t)__builtin_return_address(0));
214 	RW_SETDEBUG(rw, dodebug);
215 }
216 
217 /*
218  * rw_destroy:
219  *
220  *	Tear down a rwlock.
221  */
222 void
223 rw_destroy(krwlock_t *rw)
224 {
225 
226 	RW_ASSERT(rw, (rw->rw_owner & ~RW_DEBUG) == 0);
227 	LOCKDEBUG_FREE(RW_DEBUG_P(rw), rw);
228 }
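
/*
 * Usage sketch (illustrative only; the "softc" structure and its field
 * names are hypothetical, and the snippet is not compiled as part of
 * this file).  A typical life cycle for a lock managed by this module:
 *
 *	struct softc {
 *		krwlock_t	sc_lock;
 *		int		sc_count;
 *	} sc;
 *
 *	rw_init(&sc.sc_lock);
 *
 *	rw_enter(&sc.sc_lock, RW_READER);	shared hold: read state
 *	(void)sc.sc_count;
 *	rw_exit(&sc.sc_lock);
 *
 *	rw_enter(&sc.sc_lock, RW_WRITER);	exclusive hold: modify state
 *	sc.sc_count++;
 *	rw_exit(&sc.sc_lock);
 *
 *	rw_destroy(&sc.sc_lock);
 */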
229 
230 /*
231  * rw_onproc:
232  *
233  *	Return true if an rwlock owner is running on a CPU in the system.
234  *	If the target is waiting on the kernel big lock, then we must
235  *	release it.  This is necessary to avoid deadlock.
236  *
237  *	Note that we can't use the rwlock owner field as an LWP pointer.  We
238  *	don't have full control over the timing of our execution, and so the
239  *	pointer could be completely invalid by the time we dereference it.
240  */
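/*
 * Illustrative cases (a sketch, not compiled here; "lwp" stands in for
 * an LWP address), in terms of the owner word examined below:
 *
 *	owner == 3 * RW_READ_INCR			-> return 0 (sleep)
 *	owner == (lwp | RW_WRITE_LOCKED | RW_HAS_WAITERS)
 *							-> return 0 (sleep)
 *	owner == (lwp | RW_WRITE_LOCKED), lwp running on a CPU and not
 *	    itself waiting for the big lock		-> return nonzero (spin)
 */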
241 static int
242 rw_onproc(uintptr_t owner, struct cpu_info **cip)
243 {
244 #ifdef MULTIPROCESSOR
245 	CPU_INFO_ITERATOR cii;
246 	struct cpu_info *ci;
247 	lwp_t *l;
248 
249 	if ((owner & (RW_WRITE_LOCKED|RW_HAS_WAITERS)) != RW_WRITE_LOCKED)
250 		return 0;
251 	l = (lwp_t *)(owner & RW_THREAD);
252 
253 	/* See if the target is running on a CPU somewhere. */
254 	if ((ci = *cip) != NULL && ci->ci_curlwp == l)
255 		goto run;
256 	for (CPU_INFO_FOREACH(cii, ci))
257 		if (ci->ci_curlwp == l)
258 			goto run;
259 
260 	/* No: it may be safe to block now. */
261 	*cip = NULL;
262 	return 0;
263 
264  run:
265  	/* Target is running; do we need to block? */
266  	*cip = ci;
267 	return ci->ci_biglock_wanted != l;
268 #else
269 	return 0;
270 #endif	/* MULTIPROCESSOR */
271 }
272 
273 /*
274  * rw_vector_enter:
275  *
276  *	Acquire a rwlock.
277  */
278 void
279 rw_vector_enter(krwlock_t *rw, const krw_t op)
280 {
281 	uintptr_t owner, incr, need_wait, set_wait, curthread, next;
282 	struct cpu_info *ci;
283 	turnstile_t *ts;
284 	int queue;
285 	lwp_t *l;
286 	LOCKSTAT_TIMER(slptime);
287 	LOCKSTAT_COUNTER(slpcnt);
288 	LOCKSTAT_TIMER(spintime);
289 	LOCKSTAT_COUNTER(spincnt);
290 	LOCKSTAT_FLAG(lsflag);
291 
292 	l = curlwp;
293 	curthread = (uintptr_t)l;
294 
295 	RW_ASSERT(rw, !cpu_intr_p());
296 	RW_ASSERT(rw, curthread != 0);
297 	RW_WANTLOCK(rw, op);
298 
299 	if (panicstr == NULL) {
300 		LOCKDEBUG_BARRIER(&kernel_lock, 1);
301 	}
302 
303 	/*
304 	 * We play a slight trick here.  If we're a reader, we want to
305 	 * increment the read count.  If we're a writer, we want to
306 	 * set the owner field and the WRITE_LOCKED bit.
307 	 *
308 	 * In the latter case, we expect those bits to be zero,
309 	 * therefore we can use an add operation to set them, which
310 	 * means an add operation for both cases.
311 	 */
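	/*
	 * Illustrative arithmetic (a sketch; the values are not taken
	 * from a real lock):
	 *
	 *	reader:  owner == 2 * RW_READ_INCR
	 *	         owner + incr == 3 * RW_READ_INCR
	 *
	 *	writer:  owner == 0 (lock is free)
	 *	         owner + incr == curthread | RW_WRITE_LOCKED
	 *
	 * so both acquisitions below come down to the same compare-and-swap
	 * of owner + incr, with RW_WRITE_WANTED cleared on success.
	 */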
312 	if (__predict_true(op == RW_READER)) {
313 		incr = RW_READ_INCR;
314 		set_wait = RW_HAS_WAITERS;
315 		need_wait = RW_WRITE_LOCKED | RW_WRITE_WANTED;
316 		queue = TS_READER_Q;
317 	} else {
318 		RW_DASSERT(rw, op == RW_WRITER);
319 		incr = curthread | RW_WRITE_LOCKED;
320 		set_wait = RW_HAS_WAITERS | RW_WRITE_WANTED;
321 		need_wait = RW_WRITE_LOCKED | RW_THREAD;
322 		queue = TS_WRITER_Q;
323 	}
324 
325 	LOCKSTAT_ENTER(lsflag);
326 
327 	for (ci = NULL, owner = rw->rw_owner;;) {
328 		/*
329 		 * Read the lock owner field.  If the need-to-wait
330 		 * indicator is clear, then try to acquire the lock.
331 		 */
332 		if ((owner & need_wait) == 0) {
333 			next = rw_cas(rw, owner, (owner + incr) &
334 			    ~RW_WRITE_WANTED);
335 			if (__predict_true(next == owner)) {
336 				/* Got it! */
337 #ifndef __HAVE_ATOMIC_AS_MEMBAR
338 				membar_enter();
339 #endif
340 				break;
341 			}
342 
343 			/*
344 			 * Didn't get it -- spin around again (we'll
345 			 * probably sleep on the next iteration).
346 			 */
347 			owner = next;
348 			continue;
349 		}
350 
351 		if (__predict_false(panicstr != NULL))
352 			return;
353 		if (__predict_false(RW_OWNER(rw) == curthread))
354 			rw_abort(rw, __func__, "locking against myself");
355 
356 		/*
357 		 * If the lock owner is running on another CPU, and
358 		 * there are no existing waiters, then spin.
359 		 */
360 		if (rw_onproc(owner, &ci)) {
361 			LOCKSTAT_START_TIMER(lsflag, spintime);
362 			u_int count = SPINLOCK_BACKOFF_MIN;
363 			do {
364 				SPINLOCK_BACKOFF(count);
365 				owner = rw->rw_owner;
366 			} while (rw_onproc(owner, &ci));
367 			LOCKSTAT_STOP_TIMER(lsflag, spintime);
368 			LOCKSTAT_COUNT(spincnt, 1);
369 			if ((owner & need_wait) == 0)
370 				continue;
371 		}
372 
373 		/*
374 		 * Grab the turnstile chain lock.  Once we have that, we
375 		 * can adjust the waiter bits and sleep queue.
376 		 */
377 		ts = turnstile_lookup(rw);
378 
379 		/*
380 		 * Mark the rwlock as having waiters.  If the set fails,
381 		 * then we may not need to sleep and should spin again.
382 		 * Reload rw_owner because turnstile_lookup() may have
383 		 * spun on the turnstile chain lock.
384 		 */
385 		owner = rw->rw_owner;
386 		if ((owner & need_wait) == 0 || rw_onproc(owner, &ci)) {
387 			turnstile_exit(rw);
388 			continue;
389 		}
390 		next = rw_cas(rw, owner, owner | set_wait);
391 		if (__predict_false(next != owner)) {
392 			turnstile_exit(rw);
393 			owner = next;
394 			continue;
395 		}
396 
397 		LOCKSTAT_START_TIMER(lsflag, slptime);
398 		turnstile_block(ts, queue, rw, &rw_syncobj);
399 		LOCKSTAT_STOP_TIMER(lsflag, slptime);
400 		LOCKSTAT_COUNT(slpcnt, 1);
401 
402 		/*
403 		 * No need for a memory barrier because of context switch.
404 		 * If not handed the lock, then spin again.
405 		 */
406 		if (op == RW_READER || (rw->rw_owner & RW_THREAD) == curthread)
407 			break;
408 	}
409 
410 	LOCKSTAT_EVENT(lsflag, rw, LB_RWLOCK |
411 	    (op == RW_WRITER ? LB_SLEEP1 : LB_SLEEP2), slpcnt, slptime);
412 	LOCKSTAT_EVENT(lsflag, rw, LB_RWLOCK | LB_SPIN, spincnt, spintime);
413 	LOCKSTAT_EXIT(lsflag);
414 
415 	RW_DASSERT(rw, (op != RW_READER && RW_OWNER(rw) == curthread) ||
416 	    (op == RW_READER && RW_COUNT(rw) != 0));
417 	RW_LOCKED(rw, op);
418 }
419 
420 /*
421  * rw_vector_exit:
422  *
423  *	Release a rwlock.
424  */
425 void
426 rw_vector_exit(krwlock_t *rw)
427 {
428 	uintptr_t curthread, owner, decr, new, next;
429 	turnstile_t *ts;
430 	int rcnt, wcnt;
431 	lwp_t *l;
432 
433 	curthread = (uintptr_t)curlwp;
434 	RW_ASSERT(rw, curthread != 0);
435 
436 	if (__predict_false(panicstr != NULL))
437 		return;
438 
439 	/*
440 	 * Again, we use a trick.  Since we used an add operation to
441 	 * set the required lock bits, we can use a subtract to clear
442 	 * them, which makes the read-release and write-release path
443 	 * the same.
444 	 */
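	/*
	 * Illustrative arithmetic for the release (a sketch; the values
	 * are not taken from a real lock):
	 *
	 *	reader:  3 * RW_READ_INCR - RW_READ_INCR == 2 * RW_READ_INCR
	 *	writer:  (curthread | RW_WRITE_LOCKED | RW_HAS_WAITERS)
	 *	             - (curthread | RW_WRITE_LOCKED) == RW_HAS_WAITERS
	 *
	 * Waiter bits are not part of the decrement, so they survive the
	 * subtraction and are dealt with below.
	 */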
445 	owner = rw->rw_owner;
446 	if (__predict_false((owner & RW_WRITE_LOCKED) != 0)) {
447 		RW_UNLOCKED(rw, RW_WRITER);
448 		RW_ASSERT(rw, RW_OWNER(rw) == curthread);
449 		decr = curthread | RW_WRITE_LOCKED;
450 	} else {
451 		RW_UNLOCKED(rw, RW_READER);
452 		RW_ASSERT(rw, RW_COUNT(rw) != 0);
453 		decr = RW_READ_INCR;
454 	}
455 
456 	/*
457 	 * Compute what we expect the new value of the lock to be. Only
458 	 * proceed to do direct handoff if there are waiters, and if the
459 	 * lock would become unowned.
460 	 */
461 #ifndef __HAVE_ATOMIC_AS_MEMBAR
462 	membar_exit();
463 #endif
464 	for (;;) {
465 		new = (owner - decr);
466 		if ((new & (RW_THREAD | RW_HAS_WAITERS)) == RW_HAS_WAITERS)
467 			break;
468 		next = rw_cas(rw, owner, new);
469 		if (__predict_true(next == owner))
470 			return;
471 		owner = next;
472 	}
473 
474 	/*
475 	 * Grab the turnstile chain lock.  This gets the interlock
476 	 * on the sleep queue.  Once we have that, we can adjust the
477 	 * waiter bits.
478 	 */
479 	ts = turnstile_lookup(rw);
480 	owner = rw->rw_owner;
481 	RW_DASSERT(rw, ts != NULL);
482 	RW_DASSERT(rw, (owner & RW_HAS_WAITERS) != 0);
483 
484 	wcnt = TS_WAITERS(ts, TS_WRITER_Q);
485 	rcnt = TS_WAITERS(ts, TS_READER_Q);
486 
487 	/*
488 	 * Give the lock away.
489 	 *
490 	 * If we are releasing a write lock, then prefer to wake all
491 	 * outstanding readers.  Otherwise, wake one writer if there
492 	 * are outstanding readers, or all writers if there are no
493 	 * pending readers.  If waking one specific writer, the writer
494 	 * is handed the lock here.  If waking multiple writers, we
495 	 * set WRITE_WANTED to block out new readers, and let them
496 	 * do the work of acquiring the lock in rw_vector_enter().
497 	 */
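	/*
	 * For instance (illustrative only): a writer releasing with three
	 * readers and one writer queued wakes all three readers and leaves
	 * RW_HAS_WAITERS | RW_WRITE_WANTED set for the remaining writer;
	 * the last reader releasing with one writer and two readers queued
	 * hands the lock directly to that writer and keeps RW_HAS_WAITERS
	 * set for the readers still asleep.
	 */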
498 	if (rcnt == 0 || decr == RW_READ_INCR) {
499 		RW_DASSERT(rw, wcnt != 0);
500 		RW_DASSERT(rw, (owner & RW_WRITE_WANTED) != 0);
501 
502 		if (rcnt != 0) {
503 			/* Give the lock to the longest waiting writer. */
504 			l = TS_FIRST(ts, TS_WRITER_Q);
505 			new = (uintptr_t)l | RW_WRITE_LOCKED | RW_HAS_WAITERS;
506 			if (wcnt > 1)
507 				new |= RW_WRITE_WANTED;
508 			rw_swap(rw, owner, new);
509 			turnstile_wakeup(ts, TS_WRITER_Q, 1, l);
510 		} else {
511 			/* Wake all writers and let them fight it out. */
512 			rw_swap(rw, owner, RW_WRITE_WANTED);
513 			turnstile_wakeup(ts, TS_WRITER_Q, wcnt, NULL);
514 		}
515 	} else {
516 		RW_DASSERT(rw, rcnt != 0);
517 
518 		/*
519 		 * Give the lock to all blocked readers.  If there
520 		 * is a writer waiting, new readers that arrive
521 		 * after the release will be blocked out.
522 		 */
523 		new = rcnt << RW_READ_COUNT_SHIFT;
524 		if (wcnt != 0)
525 			new |= RW_HAS_WAITERS | RW_WRITE_WANTED;
526 
527 		/* Wake up all sleeping readers. */
528 		rw_swap(rw, owner, new);
529 		turnstile_wakeup(ts, TS_READER_Q, rcnt, NULL);
530 	}
531 }
532 
533 /*
534  * rw_vector_tryenter:
535  *
536  *	Try to acquire a rwlock.
537  */
538 int
539 rw_vector_tryenter(krwlock_t *rw, const krw_t op)
540 {
541 	uintptr_t curthread, owner, incr, need_wait, next;
542 
543 	curthread = (uintptr_t)curlwp;
544 
545 	RW_ASSERT(rw, curthread != 0);
546 
547 	if (op == RW_READER) {
548 		incr = RW_READ_INCR;
549 		need_wait = RW_WRITE_LOCKED | RW_WRITE_WANTED;
550 	} else {
551 		RW_DASSERT(rw, op == RW_WRITER);
552 		incr = curthread | RW_WRITE_LOCKED;
553 		need_wait = RW_WRITE_LOCKED | RW_THREAD;
554 	}
555 
556 	for (owner = rw->rw_owner;; owner = next) {
558 		if (__predict_false((owner & need_wait) != 0))
559 			return 0;
560 		next = rw_cas(rw, owner, owner + incr);
561 		if (__predict_true(next == owner)) {
562 			/* Got it! */
563 			break;
564 		}
565 	}
566 
567 #ifndef __HAVE_ATOMIC_AS_MEMBAR
568 	membar_enter();
569 #endif
570 	RW_WANTLOCK(rw, op);
571 	RW_LOCKED(rw, op);
572 	RW_DASSERT(rw, (op != RW_READER && RW_OWNER(rw) == curthread) ||
573 	    (op == RW_READER && RW_COUNT(rw) != 0));
574 
575 	return 1;
576 }
577 
578 /*
579  * rw_downgrade:
580  *
581  *	Downgrade a write lock to a read lock.
582  */
583 void
584 rw_downgrade(krwlock_t *rw)
585 {
586 	uintptr_t owner, curthread, new, next;
587 	turnstile_t *ts;
588 	int rcnt, wcnt;
589 
590 	curthread = (uintptr_t)curlwp;
591 	RW_ASSERT(rw, curthread != 0);
592 	RW_DASSERT(rw, (rw->rw_owner & RW_WRITE_LOCKED) != 0);
593 	RW_ASSERT(rw, RW_OWNER(rw) == curthread);
594 	RW_UNLOCKED(rw, RW_WRITER);
595 
596 #ifndef __HAVE_ATOMIC_AS_MEMBAR
597 	membar_producer();
598 #endif
599 
600 	owner = rw->rw_owner;
601 	if ((owner & RW_HAS_WAITERS) == 0) {
602 		/*
603 		 * There are no waiters, so we can do this the easy way.
604 		 * Try swapping us down to one read hold.  If it fails, the
605 		 * lock condition has changed and we most likely now have
606 		 * waiters.
607 		 */
608 		next = rw_cas(rw, owner, RW_READ_INCR);
609 		if (__predict_true(next == owner)) {
610 			RW_LOCKED(rw, RW_READER);
611 			RW_DASSERT(rw, (rw->rw_owner & RW_WRITE_LOCKED) == 0);
612 			RW_DASSERT(rw, RW_COUNT(rw) != 0);
613 			return;
614 		}
615 		owner = next;
616 	}
617 
618 	/*
619 	 * Grab the turnstile chain lock.  This gets the interlock
620 	 * on the sleep queue.  Once we have that, we can adjust the
621 	 * waiter bits.
622 	 */
623 	for (;; owner = next) {
624 		ts = turnstile_lookup(rw);
625 		RW_DASSERT(rw, ts != NULL);
626 
627 		rcnt = TS_WAITERS(ts, TS_READER_Q);
628 		wcnt = TS_WAITERS(ts, TS_WRITER_Q);
629 
630 		/*
631 		 * If there are no readers, just preserve the waiters
632 		 * bits, swap us down to one read hold and return.
633 		 */
634 		if (rcnt == 0) {
635 			RW_DASSERT(rw, wcnt != 0);
636 			RW_DASSERT(rw, (rw->rw_owner & RW_WRITE_WANTED) != 0);
637 			RW_DASSERT(rw, (rw->rw_owner & RW_HAS_WAITERS) != 0);
638 
639 			new = RW_READ_INCR | RW_HAS_WAITERS | RW_WRITE_WANTED;
640 			next = rw_cas(rw, owner, new);
641 			turnstile_exit(rw);
642 			if (__predict_true(next == owner))
643 				break;
644 		} else {
645 			/*
646 			 * Give the lock to all blocked readers.  We retain
647 			 * one read hold for ourselves here.  If there
648 			 * is a writer waiting, new readers will be blocked
649 			 * out.
650 			 */
651 			new = (rcnt << RW_READ_COUNT_SHIFT) + RW_READ_INCR;
652 			if (wcnt != 0)
653 				new |= RW_HAS_WAITERS | RW_WRITE_WANTED;
654 
655 			next = rw_cas(rw, owner, new);
656 			if (__predict_true(next == owner)) {
657 				/* Wake up all sleeping readers. */
658 				turnstile_wakeup(ts, TS_READER_Q, rcnt, NULL);
659 				break;
660 			}
661 			turnstile_exit(rw);
662 		}
663 	}
664 
665 	RW_LOCKED(rw, RW_READER);
666 	RW_DASSERT(rw, (rw->rw_owner & RW_WRITE_LOCKED) == 0);
667 	RW_DASSERT(rw, RW_COUNT(rw) != 0);
668 }
669 
670 /*
671  * rw_tryupgrade:
672  *
673  *	Try to upgrade a read lock to a write lock.  We must be the
674  *	only reader.
675  */
676 int
677 rw_tryupgrade(krwlock_t *rw)
678 {
679 	uintptr_t owner, curthread, new, next;
680 
681 	curthread = (uintptr_t)curlwp;
682 	RW_ASSERT(rw, curthread != 0);
683 	RW_WANTLOCK(rw, RW_WRITER);
684 
685 	for (owner = rw->rw_owner;; owner = next) {
686 		RW_ASSERT(rw, (owner & RW_WRITE_LOCKED) == 0);
687 		if (__predict_false((owner & RW_THREAD) != RW_READ_INCR)) {
688 			RW_ASSERT(rw, (owner & RW_THREAD) != 0);
689 			return 0;
690 		}
691 		new = curthread | RW_WRITE_LOCKED | (owner & ~RW_THREAD);
692 		next = rw_cas(rw, owner, new);
693 		if (__predict_true(next == owner))
694 			break;
695 	}
696 
697 	RW_UNLOCKED(rw, RW_READER);
698 	RW_LOCKED(rw, RW_WRITER);
699 	RW_DASSERT(rw, rw->rw_owner & RW_WRITE_LOCKED);
700 	RW_DASSERT(rw, RW_OWNER(rw) == curthread);
701 
702 #ifndef __HAVE_ATOMIC_AS_MEMBAR
703 	membar_producer();
704 #endif
705 
706 	return 1;
707 }
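
/*
 * Illustrative caller pattern (a sketch; "sc_lock" is a hypothetical
 * krwlock_t and the snippet is not compiled as part of this file).
 * Because the upgrade fails whenever another reader holds the lock,
 * callers typically fall back to dropping the lock, taking it again
 * as a writer and re-validating the state that the read hold was
 * protecting:
 *
 *	rw_enter(&sc_lock, RW_READER);
 *	if (!rw_tryupgrade(&sc_lock)) {
 *		rw_exit(&sc_lock);
 *		rw_enter(&sc_lock, RW_WRITER);
 *		... re-validate cached state ...
 *	}
 *	... write locked either way at this point ...
 *	rw_exit(&sc_lock);
 */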
708 
709 /*
710  * rw_read_held:
711  *
712  *	Returns true if the rwlock is held for reading.  Must only be
713  *	used for diagnostic assertions, and never be used to make
714  * 	decisions about how to use a rwlock.
715  */
716 int
717 rw_read_held(krwlock_t *rw)
718 {
719 	uintptr_t owner;
720 
721 	if (panicstr != NULL)
722 		return 1;
723 	if (rw == NULL)
724 		return 0;
725 	owner = rw->rw_owner;
726 	return (owner & RW_WRITE_LOCKED) == 0 && (owner & RW_THREAD) != 0;
727 }
728 
729 /*
730  * rw_write_held:
731  *
732  *	Returns true if the rwlock is held for writing.  Must only be
733  *	used for diagnostic assertions, and never be used to make
734  *	decisions about how to use a rwlock.
735  */
736 int
737 rw_write_held(krwlock_t *rw)
738 {
739 
740 	if (panicstr != NULL)
741 		return 1;
742 	if (rw == NULL)
743 		return 0;
744 	return (rw->rw_owner & (RW_WRITE_LOCKED | RW_THREAD)) ==
745 	    (RW_WRITE_LOCKED | (uintptr_t)curlwp);
746 }
747 
748 /*
749  * rw_lock_held:
750  *
751  *	Returns true if the rwlock is held for reading or writing.  Must
752  *	only be used for diagnostic assertions, and never be used to make
753  *	decisions about how to use a rwlock.
754  */
755 int
756 rw_lock_held(krwlock_t *rw)
757 {
758 
759 	if (panicstr != NULL)
760 		return 1;
761 	if (rw == NULL)
762 		return 0;
763 	return (rw->rw_owner & RW_THREAD) != 0;
764 }
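
/*
 * Illustrative use of the *_held() predicates (a sketch; "sc_lock" is
 * a hypothetical krwlock_t).  They are intended for assertions only,
 * for example:
 *
 *	KASSERT(rw_write_held(&sc_lock));
 *
 * and never for run-time locking decisions, since the answer may be
 * stale by the time the caller acts on it.
 */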
765 
766 /*
767  * rw_owner:
768  *
769  *	Return the current owner of an RW lock, but only if it is write
770  *	held.  Used for priority inheritance.
771  */
772 static lwp_t *
773 rw_owner(wchan_t obj)
774 {
775 	krwlock_t *rw = (void *)(uintptr_t)obj; /* discard qualifiers */
776 	uintptr_t owner = rw->rw_owner;
777 
778 	if ((owner & RW_WRITE_LOCKED) == 0)
779 		return NULL;
780 
781 	return (void *)(owner & RW_THREAD);
782 }
783