xref: /netbsd-src/sys/kern/kern_rwlock.c (revision bdc22b2e01993381dcefeff2bc9b56ca75a4235c)
1 /*	$NetBSD: kern_rwlock.c,v 1.50 2018/02/05 04:25:04 ozaki-r Exp $	*/
2 
3 /*-
4  * Copyright (c) 2002, 2006, 2007, 2008, 2009 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Jason R. Thorpe and Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Kernel reader/writer lock implementation, modeled after those
34  * found in Solaris, a description of which can be found in:
35  *
36  *	Solaris Internals: Core Kernel Architecture, Jim Mauro and
37  *	    Richard McDougall.
38  */
39 
40 #include <sys/cdefs.h>
41 __KERNEL_RCSID(0, "$NetBSD: kern_rwlock.c,v 1.50 2018/02/05 04:25:04 ozaki-r Exp $");
42 
43 #define	__RWLOCK_PRIVATE
44 
45 #include <sys/param.h>
46 #include <sys/proc.h>
47 #include <sys/rwlock.h>
48 #include <sys/sched.h>
49 #include <sys/sleepq.h>
50 #include <sys/systm.h>
51 #include <sys/lockdebug.h>
52 #include <sys/cpu.h>
53 #include <sys/atomic.h>
54 #include <sys/lock.h>
55 
56 #include <dev/lockstat.h>
57 
58 /*
59  * LOCKDEBUG
60  */
61 
62 #if defined(LOCKDEBUG)
63 
/*
 * LOCKDEBUG hooks: report an intended acquire (RW_WANTLOCK), a completed
 * acquire (RW_LOCKED) and a release (RW_UNLOCKED) of rw to the
 * LOCKDEBUG_*() macros.  The last argument is true for a read hold.
 * The expansions deliberately carry no trailing semicolon and
 * parenthesize "op", so each use is one ordinary statement once the
 * caller appends its own ';'.
 */
64 #define	RW_WANTLOCK(rw, op)						\
65 	LOCKDEBUG_WANTLOCK(RW_DEBUG_P(rw), (rw),			\
66 	    (uintptr_t)__builtin_return_address(0), (op) == RW_READER)
67 #define	RW_LOCKED(rw, op)						\
68 	LOCKDEBUG_LOCKED(RW_DEBUG_P(rw), (rw), NULL,			\
69 	    (uintptr_t)__builtin_return_address(0), (op) == RW_READER)
70 #define	RW_UNLOCKED(rw, op)						\
71 	LOCKDEBUG_UNLOCKED(RW_DEBUG_P(rw), (rw),			\
72 	    (uintptr_t)__builtin_return_address(0), (op) == RW_READER)
/*
 * RW_DASSERT: LOCKDEBUG-only assertion.  If cond is false, rw_abort() is
 * called with the caller's function name and line and the stringified
 * condition.  The do/while (0) wrapper has no trailing semicolon so the
 * macro plus the caller's ';' forms exactly one statement.
 */
73 #define	RW_DASSERT(rw, cond)						\
74 do {									\
75 	if (!(cond))							\
76 		rw_abort(__func__, __LINE__, rw, "assertion failed: " #cond);\
77 } while (/* CONSTCOND */ 0)
78 
79 #else	/* LOCKDEBUG */
80 
/* Without LOCKDEBUG all four debugging hooks expand to nothing. */
81 #define	RW_WANTLOCK(rw, op)	/* nothing: no LOCKDEBUG_WANTLOCK() call */
82 #define	RW_LOCKED(rw, op)	/* nothing: no LOCKDEBUG_LOCKED() call */
83 #define	RW_UNLOCKED(rw, op)	/* nothing: no LOCKDEBUG_UNLOCKED() call */
84 #define	RW_DASSERT(rw, cond)	/* nothing: cond is not evaluated */
85 
86 #endif	/* LOCKDEBUG */
87 
88 /*
89  * DIAGNOSTIC
90  */
91 
92 #if defined(DIAGNOSTIC)
93 
/*
 * RW_ASSERT: DIAGNOSTIC-only assertion.  If cond is false, rw_abort() is
 * called with the caller's function name and line and the stringified
 * condition.  Without DIAGNOSTIC the macro expands to nothing, so cond
 * is not evaluated at all.
 */
94 #define	RW_ASSERT(rw, cond)						\
95 do {									\
96 	if (!(cond))							\
97 		rw_abort(__func__, __LINE__, rw, "assertion failed: " #cond);\
98 } while (/* CONSTCOND */ 0)
99 
100 #else
101 
102 #define	RW_ASSERT(rw, cond)	/* nothing */
103 
104 #endif	/* DIAGNOSTIC */
105 
/*
 * RW_NODEBUG handling.  RW_SETDEBUG() ORs RW_NODEBUG into rw_owner when
 * "on" is false (it never clears the bit); RW_DEBUG_P() is true while
 * the bit is clear.  RW_INHERITDEBUG() copies the bit from an old lock
 * word o into a new value n; it is an unparenthesized assignment meant
 * to be used as a statement, and is compiled out without LOCKDEBUG.
 */
106 #define	RW_SETDEBUG(rw, on)		((rw)->rw_owner |= (on) ? 0 : RW_NODEBUG)
107 #define	RW_DEBUG_P(rw)			(((rw)->rw_owner & RW_NODEBUG) == 0)
108 #if defined(LOCKDEBUG)
109 #define	RW_INHERITDEBUG(n, o)		(n) |= (o) & RW_NODEBUG
110 #else /* defined(LOCKDEBUG) */
111 #define	RW_INHERITDEBUG(n, o)		/* nothing */
112 #endif /* defined(LOCKDEBUG) */
113 
/*
 * Forward declarations of file-local helpers that are referenced (by the
 * assertion macros, rwlock_lockops and rw_syncobj) before they are defined.
 */
114 static void	rw_abort(const char *, size_t, krwlock_t *, const char *);
115 static void	rw_dump(const volatile void *);
116 static lwp_t	*rw_owner(wchan_t);
117 
/*
 * rw_cas:
 *
 *	Compare-and-swap the lock word from o to n via atomic_cas_ptr() and
 *	return its result; callers treat a result equal to o as success.
 */
118 static inline uintptr_t
119 rw_cas(krwlock_t *rw, uintptr_t o, uintptr_t n)
120 {
121 	volatile void *word = &rw->rw_owner;
122 	RW_INHERITDEBUG(n, o);	/* LOCKDEBUG only: copy o's RW_NODEBUG into n */
123 	void *seen = atomic_cas_ptr(word, (void *)o, (void *)n);
124 	return (uintptr_t)seen;
125 }
126 
/*
 * rw_swap:
 *
 *	Unconditionally store n in the lock word with atomic_swap_ptr().
 *	o is the value the caller expects to displace; LOCKDEBUG kernels
 *	assert that this is what was actually found.
 */
127 static inline void
128 rw_swap(krwlock_t *rw, uintptr_t o, uintptr_t n)
129 {
130 	volatile void *word = &rw->rw_owner;
131 	RW_INHERITDEBUG(n, o);	/* LOCKDEBUG only: copy o's RW_NODEBUG into n */
132 	/* n is reused to receive the lock word that was displaced. */
133 	n = (uintptr_t)atomic_swap_ptr(word, (void *)n);
134 	RW_DASSERT(rw, n == o);
135 }
136 
137 /*
138  * For platforms that do not provide stubs, or for the LOCKDEBUG case
 * (which undefines __HAVE_RW_STUBS so that this path is always taken):
 * make rw_enter, rw_exit and rw_tryenter strong aliases of the
 * rw_vector_*() functions defined in this file.
139  */
140 #ifdef LOCKDEBUG
141 #undef	__HAVE_RW_STUBS
142 #endif
143 
144 #ifndef __HAVE_RW_STUBS
145 __strong_alias(rw_enter,rw_vector_enter);
146 __strong_alias(rw_exit,rw_vector_exit);
147 __strong_alias(rw_tryenter,rw_vector_tryenter);
148 #endif
149 
/* Lock-class descriptor handed to LOCKDEBUG_ALLOC() and LOCKDEBUG_ABORT(). */
150 lockops_t rwlock_lockops = {
151 	.lo_dump = rw_dump,		/* prints owner/count and flags */
152 	.lo_type = LOCKOPS_SLEEP,
153 	.lo_name = "Reader / writer lock",
154 };
155 
/* Sync object passed to turnstile_block() when a thread sleeps on a rwlock. */
156 syncobj_t rw_syncobj = {
157 	.sobj_owner	= rw_owner,		/* write holder, or NULL */
158 	.sobj_lendpri	= sleepq_lendpri,
159 	.sobj_changepri	= turnstile_changepri,
160 	.sobj_unsleep	= turnstile_unsleep,
161 	.sobj_flag	= SOBJ_SLEEPQ_SORTED,
162 };
163 
164 /*
165  * rw_dump:
166  *
167  *	Dump the contents of a rwlock structure: one line showing the
 *	owner/count field and the flag bits of the lock word.  Installed
 *	as rwlock_lockops.lo_dump; cookie points to the krwlock_t.
168  */
169 static void
170 rw_dump(const volatile void *cookie)
171 {
172 	const volatile krwlock_t *rw = cookie;
173 
	/* %lx and %x consume unsigned arguments, so cast accordingly. */
174 	printf_nolog("owner/count  : %#018lx flags    : %#018x\n",
175 	    (unsigned long)RW_OWNER(rw), (unsigned int)RW_FLAGS(rw));
176 }
177 
178 /*
179  * rw_abort:
180  *
181  *	Report a lock error via LOCKDEBUG_ABORT() and panic the system;
182  *	does nothing when panicstr is already set.  Marked __noinline
183  *	because it generates a lot of machine code under DIAGNOSTIC.
184  */
185 static void __noinline
186 rw_abort(const char *func, size_t line, krwlock_t *rw, const char *msg)
187 {
188 
189 	if (panicstr == NULL) {
190 		/* No panic message recorded yet: hand over the report. */
191 		LOCKDEBUG_ABORT(func, line, rw, &rwlock_lockops, msg);
192 	}
193 }
194 
195 /*
196  * _rw_init:
197  *
198  *	Initialize a rwlock for use: zero the structure, then pass it and
 *	return_address to LOCKDEBUG_ALLOC().  RW_NODEBUG is set in the
 *	lock word when LOCKDEBUG_ALLOC() yields false.
199  */
200 void _rw_init(krwlock_t *, uintptr_t);
201 void
202 _rw_init(krwlock_t *rw, uintptr_t return_address)
203 {
204 	bool debug_on;
205 
206 	memset(rw, 0, sizeof *rw);
207 
208 	debug_on = LOCKDEBUG_ALLOC(rw, &rwlock_lockops, return_address);
209 	RW_SETDEBUG(rw, debug_on);	/* !debug_on: RW_NODEBUG gets set */
210 }
211 
/* Public initializer: forwards to _rw_init() with our caller's return address. */
212 void
213 rw_init(krwlock_t *rw)
214 {
215 	const uintptr_t caller = (uintptr_t)__builtin_return_address(0);
216 	_rw_init(rw, caller);
217 }
218 
219 /*
220  * rw_destroy:
221  *
222  *	Tear down a rwlock.  The lock must be completely idle: under
 *	DIAGNOSTIC, any bit other than RW_NODEBUG still set in rw_owner
 *	(owner, read count or waiter flags) is an assertion failure.
223  */
224 void
225 rw_destroy(krwlock_t *rw)
226 {
227 
228 	RW_ASSERT(rw, (rw->rw_owner & ~RW_NODEBUG) == 0);
	/* LOCKDEBUG counterpart of the LOCKDEBUG_ALLOC() done in _rw_init(). */
229 	LOCKDEBUG_FREE(RW_DEBUG_P(rw), rw);
230 }
231 
232 /*
233  * rw_oncpu:
234  *
235  *	Return true if an rwlock owner is running on a CPU in the system,
236  *	i.e. spinning is worthwhile.  False if the owner is waiting on the
237  *	kernel big lock: the caller must then block to avoid deadlock.
238  */
239 static bool
240 rw_oncpu(uintptr_t owner)
241 {
242 #ifdef MULTIPROCESSOR
243 	struct cpu_info *cpu;
244 	lwp_t *holder;
245 
246 	KASSERT(kpreempt_disabled());
247 
248 	/* Only a write hold with no waiters is a candidate for spinning. */
249 	if ((owner & (RW_WRITE_LOCKED | RW_HAS_WAITERS)) == RW_WRITE_LOCKED) {
250 		/*
251 		 * See lwp_dtor() for why dereferencing the LWP pointer is
252 		 * safe.  Kernel preemption must be disabled for that.
253 		 */
254 		holder = (lwp_t *)(owner & RW_THREAD);
255 		cpu = holder->l_cpu;
256 
257 		/* Holder is running: spin unless it wants the big lock. */
258 		if (cpu != NULL && cpu->ci_curlwp == holder) {
259 			return cpu->ci_biglock_wanted != holder;
260 		}
261 	}
262 
263 #endif
264 	/* Owner is not running (or not MULTIPROCESSOR): do not spin. */
265 	return false;
266 }
267 
268 /*
269  * rw_vector_enter:
270  *
271  *	Acquire a rwlock as reader (op == RW_READER) or writer (RW_WRITER).
 *
 *	The uncontended case is a single compare-and-swap on rw_owner.
 *	Under contention the caller spins for as long as rw_oncpu() reports
 *	a running write owner; otherwise it sets the waiter bits and sleeps
 *	on the lock's turnstile until rw_vector_exit() wakes it.  If
 *	panicstr is set the function returns instead of waiting.
272  */
273 void
274 rw_vector_enter(krwlock_t *rw, const krw_t op)
275 {
276 	uintptr_t owner, incr, need_wait, set_wait, curthread, next;
277 	turnstile_t *ts;
278 	int queue;
279 	lwp_t *l;
280 	LOCKSTAT_TIMER(slptime);
	/*
	 * NOTE(review): slpcnt is only fed to LOCKSTAT_COUNT() and
	 * LOCKSTAT_EVENT() below, like spincnt, yet it is declared with
	 * LOCKSTAT_TIMER -- confirm LOCKSTAT_COUNTER was not intended.
	 */
281 	LOCKSTAT_TIMER(slpcnt);
282 	LOCKSTAT_TIMER(spintime);
283 	LOCKSTAT_COUNTER(spincnt);
284 	LOCKSTAT_FLAG(lsflag);
285 
286 	l = curlwp;
287 	curthread = (uintptr_t)l;
288 
289 	RW_ASSERT(rw, !cpu_intr_p());
290 	RW_ASSERT(rw, curthread != 0);
291 	RW_WANTLOCK(rw, op);
292 
293 	if (panicstr == NULL) {
294 		LOCKDEBUG_BARRIER(&kernel_lock, 1);
295 	}
296 
297 	/*
298 	 * We play a slight trick here.  If we're a reader, we want to
299 	 * increment the read count.  If we're a writer, we want to
300 	 * set the owner field and the WRITE_LOCKED bit.
301 	 *
302 	 * In the latter case, we expect those bits to be zero,
303 	 * therefore we can use an add operation to set them, which
304 	 * means an add operation for both cases.
	 *
	 * need_wait is the set of lock-word bits that forces us to wait,
	 * set_wait the bits we publish before sleeping, and queue the
	 * turnstile queue we sleep on.
305 	 */
306 	if (__predict_true(op == RW_READER)) {
307 		incr = RW_READ_INCR;
308 		set_wait = RW_HAS_WAITERS;
309 		need_wait = RW_WRITE_LOCKED | RW_WRITE_WANTED;
310 		queue = TS_READER_Q;
311 	} else {
312 		RW_DASSERT(rw, op == RW_WRITER);
313 		incr = curthread | RW_WRITE_LOCKED;
314 		set_wait = RW_HAS_WAITERS | RW_WRITE_WANTED;
315 		need_wait = RW_WRITE_LOCKED | RW_THREAD;
316 		queue = TS_WRITER_Q;
317 	}
318 
319 	LOCKSTAT_ENTER(lsflag);
320 
	/* rw_oncpu() asserts that kernel preemption is disabled. */
321 	KPREEMPT_DISABLE(curlwp);
322 	for (owner = rw->rw_owner; ;) {
323 		/*
324 		 * Read the lock owner field.  If the need-to-wait
325 		 * indicator is clear, then try to acquire the lock.
		 * RW_WRITE_WANTED is cleared in the value we install.
326 		 */
327 		if ((owner & need_wait) == 0) {
328 			next = rw_cas(rw, owner, (owner + incr) &
329 			    ~RW_WRITE_WANTED);
330 			if (__predict_true(next == owner)) {
331 				/* Got it! */
332 				membar_enter();
333 				break;
334 			}
335 
336 			/*
337 			 * Didn't get it -- spin around again (we'll
338 			 * probably sleep on the next iteration).
339 			 */
340 			owner = next;
341 			continue;
342 		}
		/* panicstr is set: return without waiting for the lock. */
343 		if (__predict_false(panicstr != NULL)) {
344 			KPREEMPT_ENABLE(curlwp);
345 			return;
346 		}
347 		if (__predict_false(RW_OWNER(rw) == curthread)) {
348 			rw_abort(__func__, __LINE__, rw,
349 			    "locking against myself");
350 		}
351 		/*
352 		 * If the lock owner is running on another CPU, and
353 		 * there are no existing waiters, then spin.
		 * Preemption is re-enabled around each backoff step.
354 		 */
355 		if (rw_oncpu(owner)) {
356 			LOCKSTAT_START_TIMER(lsflag, spintime);
357 			u_int count = SPINLOCK_BACKOFF_MIN;
358 			do {
359 				KPREEMPT_ENABLE(curlwp);
360 				SPINLOCK_BACKOFF(count);
361 				KPREEMPT_DISABLE(curlwp);
362 				owner = rw->rw_owner;
363 			} while (rw_oncpu(owner));
364 			LOCKSTAT_STOP_TIMER(lsflag, spintime);
365 			LOCKSTAT_COUNT(spincnt, 1);
			/* The lock may be available now: retry the CAS. */
366 			if ((owner & need_wait) == 0)
367 				continue;
368 		}
369 
370 		/*
371 		 * Grab the turnstile chain lock.  Once we have that, we
372 		 * can adjust the waiter bits and sleep queue.
373 		 */
374 		ts = turnstile_lookup(rw);
375 
376 		/*
377 		 * Mark the rwlock as having waiters.  If the set fails,
378 		 * then we may not need to sleep and should spin again.
379 		 * Reload rw_owner because turnstile_lookup() may have
380 		 * spun on the turnstile chain lock.
381 		 */
382 		owner = rw->rw_owner;
383 		if ((owner & need_wait) == 0 || rw_oncpu(owner)) {
384 			turnstile_exit(rw);
385 			continue;
386 		}
387 		next = rw_cas(rw, owner, owner | set_wait);
388 		if (__predict_false(next != owner)) {
389 			turnstile_exit(rw);
390 			owner = next;
391 			continue;
392 		}
393 
394 		LOCKSTAT_START_TIMER(lsflag, slptime);
395 		turnstile_block(ts, queue, rw, &rw_syncobj);
396 		LOCKSTAT_STOP_TIMER(lsflag, slptime);
397 		LOCKSTAT_COUNT(slpcnt, 1);
398 
399 		/*
400 		 * No need for a memory barrier because of context switch.
401 		 * If not handed the lock, then spin again.  A woken reader
		 * always leaves the loop here; a writer only if the lock
		 * word already names it as owner.
402 		 */
403 		if (op == RW_READER || (rw->rw_owner & RW_THREAD) == curthread)
404 			break;
405 
406 		owner = rw->rw_owner;
407 	}
408 	KPREEMPT_ENABLE(curlwp);
409 
410 	LOCKSTAT_EVENT(lsflag, rw, LB_RWLOCK |
411 	    (op == RW_WRITER ? LB_SLEEP1 : LB_SLEEP2), slpcnt, slptime);
412 	LOCKSTAT_EVENT(lsflag, rw, LB_RWLOCK | LB_SPIN, spincnt, spintime);
413 	LOCKSTAT_EXIT(lsflag);
414 
415 	RW_DASSERT(rw, (op != RW_READER && RW_OWNER(rw) == curthread) ||
416 	    (op == RW_READER && RW_COUNT(rw) != 0));
417 	RW_LOCKED(rw, op);
418 }
419 
420 /*
421  * rw_vector_exit:
422  *
423  *	Release a rwlock held by the caller; whether it is a read or a
 *	write hold is taken from RW_WRITE_LOCKED in the lock word.
 *
 *	Normally this is one compare-and-swap.  Only when the lock would
 *	become unowned while RW_HAS_WAITERS is set does it take the
 *	turnstile chain lock and hand the lock to waiters (see below).
 *	Returns immediately, without releasing, if panicstr is set.
424  */
425 void
426 rw_vector_exit(krwlock_t *rw)
427 {
428 	uintptr_t curthread, owner, decr, newown, next;
429 	turnstile_t *ts;
430 	int rcnt, wcnt;
431 	lwp_t *l;
432 
433 	curthread = (uintptr_t)curlwp;
434 	RW_ASSERT(rw, curthread != 0);
435 
436 	if (__predict_false(panicstr != NULL))
437 		return;
438 
439 	/*
440 	 * Again, we use a trick.  Since we used an add operation to
441 	 * set the required lock bits, we can use a subtract to clear
442 	 * them, which makes the read-release and write-release path
443 	 * the same.
444 	 */
445 	owner = rw->rw_owner;
446 	if (__predict_false((owner & RW_WRITE_LOCKED) != 0)) {
447 		RW_UNLOCKED(rw, RW_WRITER);
448 		RW_ASSERT(rw, RW_OWNER(rw) == curthread);
449 		decr = curthread | RW_WRITE_LOCKED;
450 	} else {
451 		RW_UNLOCKED(rw, RW_READER);
452 		RW_ASSERT(rw, RW_COUNT(rw) != 0);
453 		decr = RW_READ_INCR;
454 	}
455 
456 	/*
457 	 * Compute what we expect the new value of the lock to be. Only
458 	 * proceed to do direct handoff if there are waiters, and if the
459 	 * lock would become unowned.  Otherwise retry the CAS until it
	 * succeeds and return.
460 	 */
461 	membar_exit();
462 	for (;;) {
463 		newown = (owner - decr);
464 		if ((newown & (RW_THREAD | RW_HAS_WAITERS)) == RW_HAS_WAITERS)
465 			break;
466 		next = rw_cas(rw, owner, newown);
467 		if (__predict_true(next == owner))
468 			return;
469 		owner = next;
470 	}
471 
472 	/*
473 	 * Grab the turnstile chain lock.  This gets the interlock
474 	 * on the sleep queue.  Once we have that, we can adjust the
475 	 * waiter bits.
476 	 */
477 	ts = turnstile_lookup(rw);
478 	owner = rw->rw_owner;
479 	RW_DASSERT(rw, ts != NULL);
480 	RW_DASSERT(rw, (owner & RW_HAS_WAITERS) != 0);
481 
482 	wcnt = TS_WAITERS(ts, TS_WRITER_Q);
483 	rcnt = TS_WAITERS(ts, TS_READER_Q);
484 
485 	/*
486 	 * Give the lock away.
487 	 *
488 	 * If we are releasing a write lock, then prefer to wake all
489 	 * outstanding readers.  Otherwise, wake one writer if there
490 	 * are outstanding readers, or all writers if there are no
491 	 * pending readers.  If waking one specific writer, the writer
492 	 * is handed the lock here.  If waking multiple writers, we
493 	 * set WRITE_WANTED to block out new readers, and let them
494 	 * do the work of acquiring the lock in rw_vector_enter().
495 	 */
496 	if (rcnt == 0 || decr == RW_READ_INCR) {
497 		RW_DASSERT(rw, wcnt != 0);
498 		RW_DASSERT(rw, (owner & RW_WRITE_WANTED) != 0);
499 
500 		if (rcnt != 0) {
501 			/* Give the lock to the longest waiting writer. */
502 			l = TS_FIRST(ts, TS_WRITER_Q);
503 			newown = (uintptr_t)l | RW_WRITE_LOCKED | RW_HAS_WAITERS;
			/* Other writers remain queued: keep readers out. */
504 			if (wcnt > 1)
505 				newown |= RW_WRITE_WANTED;
506 			rw_swap(rw, owner, newown);
507 			turnstile_wakeup(ts, TS_WRITER_Q, 1, l);
508 		} else {
509 			/* Wake all writers and let them fight it out. */
510 			rw_swap(rw, owner, RW_WRITE_WANTED);
511 			turnstile_wakeup(ts, TS_WRITER_Q, wcnt, NULL);
512 		}
513 	} else {
514 		RW_DASSERT(rw, rcnt != 0);
515 
516 		/*
517 		 * Give the lock to all blocked readers.  If there
518 		 * is a writer waiting, new readers that arrive
519 		 * after the release will be blocked out.  The read
		 * count is preset to the number of readers woken.
520 		 */
521 		newown = rcnt << RW_READ_COUNT_SHIFT;
522 		if (wcnt != 0)
523 			newown |= RW_HAS_WAITERS | RW_WRITE_WANTED;
524 
525 		/* Wake up all sleeping readers. */
526 		rw_swap(rw, owner, newown);
527 		turnstile_wakeup(ts, TS_READER_Q, rcnt, NULL);
528 	}
529 }
530 
531 /*
532  * rw_vector_tryenter:
533  *
534  *	Try to acquire a rwlock as reader (op == RW_READER) or writer
 *	without ever blocking.  Returns 1 with the lock held, or 0 as
 *	soon as the lock word shows a state the caller would have to
 *	wait for (write-locked or write-wanted for a reader; any owner
 *	or read hold for a writer).
535  */
536 int
537 rw_vector_tryenter(krwlock_t *rw, const krw_t op)
538 {
539 	uintptr_t curthread, owner, incr, need_wait, next;
540 
541 	curthread = (uintptr_t)curlwp;
542 
543 	RW_ASSERT(rw, curthread != 0);
544 
545 	if (op == RW_READER) {
546 		incr = RW_READ_INCR;
547 		need_wait = RW_WRITE_LOCKED | RW_WRITE_WANTED;
548 	} else {
549 		RW_DASSERT(rw, op == RW_WRITER);
550 		incr = curthread | RW_WRITE_LOCKED;
551 		need_wait = RW_WRITE_LOCKED | RW_THREAD;
552 	}
553 
554 	for (owner = rw->rw_owner;; owner = next) {
555 		/* owner: the initial load, or what the failed CAS saw. */
556 		if (__predict_false((owner & need_wait) != 0))
557 			return 0;
558 		next = rw_cas(rw, owner, owner + incr);
559 		if (__predict_true(next == owner)) {
560 			/* Got it! */
561 			membar_enter();
562 			break;
563 		}
564 	}
565 
566 	RW_WANTLOCK(rw, op);
567 	RW_LOCKED(rw, op);
568 	RW_DASSERT(rw, (op != RW_READER && RW_OWNER(rw) == curthread) ||
569 	    (op == RW_READER && RW_COUNT(rw) != 0));
570 
571 	return 1;
572 }
573 
574 /*
575  * rw_downgrade:
576  *
577  *	Downgrade a write lock to a read lock.  The caller must hold rw
 *	as writer (asserted).  Without waiters the lock word is simply
 *	swapped to a single read hold; with waiters the change is made
 *	under the turnstile chain lock and any sleeping readers are woken
 *	to share the lock with the caller.
578  */
579 void
580 rw_downgrade(krwlock_t *rw)
581 {
582 	uintptr_t owner, curthread, newown, next;
583 	turnstile_t *ts;
584 	int rcnt, wcnt;
585 
586 	curthread = (uintptr_t)curlwp;
587 	RW_ASSERT(rw, curthread != 0);
588 	RW_DASSERT(rw, (rw->rw_owner & RW_WRITE_LOCKED) != 0);
589 	RW_ASSERT(rw, RW_OWNER(rw) == curthread);
590 	RW_UNLOCKED(rw, RW_WRITER);
	/* Without DIAGNOSTIC the RW_ASSERTs vanish and curthread is unread. */
591 #if !defined(DIAGNOSTIC)
592 	__USE(curthread);
593 #endif
594 
595 
596 	membar_producer();
597 	owner = rw->rw_owner;
598 	if ((owner & RW_HAS_WAITERS) == 0) {
599 		/*
600 		 * There are no waiters, so we can do this the easy way.
601 		 * Try swapping us down to one read hold.  If it fails, the
602 		 * lock condition has changed and we most likely now have
603 		 * waiters.
604 		 */
605 		next = rw_cas(rw, owner, RW_READ_INCR);
606 		if (__predict_true(next == owner)) {
607 			RW_LOCKED(rw, RW_READER);
608 			RW_DASSERT(rw, (rw->rw_owner & RW_WRITE_LOCKED) == 0);
609 			RW_DASSERT(rw, RW_COUNT(rw) != 0);
610 			return;
611 		}
612 		owner = next;
613 	}
614 
615 	/*
616 	 * Grab the turnstile chain lock.  This gets the interlock
617 	 * on the sleep queue.  Once we have that, we can adjust the
618 	 * waiter bits.  Each failed compare-and-swap below drops the
	 * chain lock and retries with the value the CAS observed.
619 	 */
620 	for (;; owner = next) {
621 		ts = turnstile_lookup(rw);
622 		RW_DASSERT(rw, ts != NULL);
623 
624 		rcnt = TS_WAITERS(ts, TS_READER_Q);
625 		wcnt = TS_WAITERS(ts, TS_WRITER_Q);
626 
627 		/*
628 		 * If there are no readers, just preserve the waiters
629 		 * bits, swap us down to one read hold and return.
630 		 */
631 		if (rcnt == 0) {
632 			RW_DASSERT(rw, wcnt != 0);
633 			RW_DASSERT(rw, (rw->rw_owner & RW_WRITE_WANTED) != 0);
634 			RW_DASSERT(rw, (rw->rw_owner & RW_HAS_WAITERS) != 0);
635 
636 			newown = RW_READ_INCR | RW_HAS_WAITERS | RW_WRITE_WANTED;
637 			next = rw_cas(rw, owner, newown);
638 			turnstile_exit(rw);
639 			if (__predict_true(next == owner))
640 				break;
641 		} else {
642 			/*
643 			 * Give the lock to all blocked readers.  We may
644 			 * retain one read hold if downgrading.  If there
645 			 * is a writer waiting, new readers will be blocked
646 			 * out.  The new count is the woken readers plus us.
647 			 */
648 			newown = (rcnt << RW_READ_COUNT_SHIFT) + RW_READ_INCR;
649 			if (wcnt != 0)
650 				newown |= RW_HAS_WAITERS | RW_WRITE_WANTED;
651 
652 			next = rw_cas(rw, owner, newown);
653 			if (__predict_true(next == owner)) {
654 				/* Wake up all sleeping readers. */
655 				turnstile_wakeup(ts, TS_READER_Q, rcnt, NULL);
656 				break;
657 			}
658 			turnstile_exit(rw);
659 		}
660 	}
661 
662 	RW_WANTLOCK(rw, RW_READER);
663 	RW_LOCKED(rw, RW_READER);
664 	RW_DASSERT(rw, (rw->rw_owner & RW_WRITE_LOCKED) == 0);
665 	RW_DASSERT(rw, RW_COUNT(rw) != 0);
666 }
667 
668 /*
669  * rw_tryupgrade:
670  *
671  *	Try to upgrade a read lock to a write lock.  We must be the
672  *	only reader.  Returns 1 with the lock now write-held by the
 *	caller, or 0 (still read-held) if the read count is not exactly
 *	one.  Never blocks.
673  */
674 int
675 rw_tryupgrade(krwlock_t *rw)
676 {
677 	uintptr_t owner, curthread, newown, next;
678 
679 	curthread = (uintptr_t)curlwp;
680 	RW_ASSERT(rw, curthread != 0);
681 	RW_ASSERT(rw, rw_read_held(rw));
682 
	/* On CAS failure, retry with the lock word the CAS observed. */
683 	for (owner = rw->rw_owner;; owner = next) {
684 		RW_ASSERT(rw, (owner & RW_WRITE_LOCKED) == 0);
		/* Owner field holds the read count here: need exactly one. */
685 		if (__predict_false((owner & RW_THREAD) != RW_READ_INCR)) {
686 			RW_ASSERT(rw, (owner & RW_THREAD) != 0);
687 			return 0;
688 		}
		/* Replace the count with ourselves; keep the flag bits. */
689 		newown = curthread | RW_WRITE_LOCKED | (owner & ~RW_THREAD);
690 		next = rw_cas(rw, owner, newown);
691 		if (__predict_true(next == owner)) {
692 			membar_producer();
693 			break;
694 		}
695 	}
696 
697 	RW_UNLOCKED(rw, RW_READER);
698 	RW_WANTLOCK(rw, RW_WRITER);
699 	RW_LOCKED(rw, RW_WRITER);
700 	RW_DASSERT(rw, rw->rw_owner & RW_WRITE_LOCKED);
701 	RW_DASSERT(rw, RW_OWNER(rw) == curthread);
702 
703 	return 1;
704 }
705 
706 /*
707  * rw_read_held:
708  *
709  *	Returns true if the rwlock is held for reading (no write owner,
710  *	non-zero count); 1 whenever panicstr is set, 0 for a NULL lock.
711  *	For diagnostic assertions only, never to decide how to use a lock.
712  */
713 int
714 rw_read_held(krwlock_t *rw)
715 {
716 	uintptr_t word;
717 
718 	if (panicstr != NULL)
719 		return 1;
720 
721 	/* A NULL lock is evaluated as an all-zero word: not held. */
722 	word = (rw != NULL) ? rw->rw_owner : 0;
723 	return (word & RW_THREAD) != 0 && (word & RW_WRITE_LOCKED) == 0;
724 }
725 
726 /*
727  * rw_write_held:
728  *
729  *	Returns true if the rwlock is write-held by the calling LWP;
730  *	1 whenever panicstr is set, 0 for a NULL lock.  For diagnostic
731  *	assertions only, never to decide how to use a rwlock.
732  */
733 int
734 rw_write_held(krwlock_t *rw)
735 {
736 	const uintptr_t bits = RW_WRITE_LOCKED | RW_THREAD;
737 
738 	if (panicstr != NULL)
739 		return 1;
740 	if (rw == NULL)
741 		return 0;
742 	return (rw->rw_owner & bits) == ((uintptr_t)curlwp | RW_WRITE_LOCKED);
743 }
744 
745 /*
746  * rw_lock_held:
747  *
748  *	Returns true if the rwlock is held for reading or writing, i.e.
749  *	its owner/count field is non-zero; 1 whenever panicstr is set, 0
750  *	for a NULL lock.  For diagnostic assertions only.
751  */
752 int
753 rw_lock_held(krwlock_t *rw)
754 {
755 	int held = 1;	/* the answer given while panicstr is set */
756 
757 	if (panicstr == NULL) {
758 		held = (rw != NULL && (rw->rw_owner & RW_THREAD) != 0);
759 	}
760 	return held;
761 }
762 
763 /*
764  * rw_owner:
765  *
766  *	Return the LWP that holds the RW lock for writing, or NULL when
767  *	it is not write held.  Used for priority inheritance.
768  */
769 static lwp_t *
770 rw_owner(wchan_t obj)
771 {
772 	krwlock_t *lock = (void *)(uintptr_t)obj; /* discard qualifiers */
773 	const uintptr_t word = lock->rw_owner;
774 
775 	/* Without RW_WRITE_LOCKED the field is not an LWP pointer. */
776 	if ((word & RW_WRITE_LOCKED) != 0)
777 		return (void *)(word & RW_THREAD);
778 	return NULL;
779 }
780